赞
踩
# LabelEncoder demo: fit on a list of city names, then map labels to
# integer codes and back again. Classes are stored sorted, so
# amsterdam=0, paris=1, tokyo=2.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])
print('标签个数:%s' % le.classes_)  # ['amsterdam' 'paris' 'tokyo']
print('标签值标准化:%s' % le.transform(["tokyo", "tokyo", "paris"]))  # [2 2 1]
print('标准化标签值反转:%s' % le.inverse_transform([2, 2, 1]))  # ['tokyo' 'tokyo' 'paris']
- from collections import Counter
- # Helper snippet: count one user's behaviour records by type.
- # NOTE(review): `group` is undefined here — this looks like a stray
- # excerpt of the add_type_count function defined further down the page.
- behavior_type = group.type.astype(int) # behaviour type code of each record
- type_cnt = Counter(behavior_type) # 1 browse, 2 add-cart, 3 del-cart, 4 buy, 5 favor, 6 click
- group['browse_num'] = type_cnt[1]
# Fragment: select the basic user-profile columns from the user file.
# BUG fix: the original column list contained the corrupted name
# 'user_iddf_usrre'; the intended columns are user_id and age
# (age is used by the cleaning steps later in the page).
df_usr = pd.read_csv(USER_FILE, header = 0)
df_usr = df_usr[['user_id', 'age', 'sex', 'user_lv_cd']]
data.info(): 查看数据的基本信息:(行数、列数、列索引、列类型、列非空值个数、内存占用)
data.describe(): 查看数据的统计信息:(总数,均值,标准差,最小值,最大值,分位数等)
将列表内部的表格合并: df_ac = pd.concat(chunks, ignore_index = True)
data.groupby(['分组字段']): 对DataFrame进行分组(可单类分组,可多类分组)
xgboost 建模
# Train an XGBoost binary classifier with AUC/error evaluation and
# early stopping, then persist the booster to disk.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)
param = {
    'n_estimators': 4000,
    'max_depth': 3,
    'min_child_weight': 5,
    'gamma': 0.1,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 10,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'error'],
}
num_round = param['n_estimators']
evallist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)
bst.save_model('bst.model')
根据已知的用户购买、浏览数据,对用户未来的购买意向进行预测。提前知道用户购买意向,可以大大提升电商平台对物流的掌控力度,提前备货。对消费者也有一定好处:商品购买意向预测相当于商品找消费者,实现个性化服务,消费者的购物体验会大大提升。
主要分为以下几步:
- import pandas as pd
-
- # Peek at the user table and the March action log, then free the memory.
- df_user = pd.read_csv('./data/JData_User.csv')
- display(df_user.head())
- df_month3 = pd.read_csv('./data/JData_Action_201603.csv')
- display(df_month3.head())
- import gc # gc.collect() forces a collection pass over all generations
- del df_user
- del df_month3
- gc.collect()
测试合并数据: pd.merge()
# pd.merge demo: by default merge joins on the shared 'sku' column and
# keeps only keys present in both frames (inner join), so rows 'c'/'d'
# vanish against df2, and 'f' never appears.
import pandas as pd

df1 = pd.DataFrame({'sku': ['a', 'b', 'c', 'd'], 'data': [1, 1, 2, 3]})
df2 = pd.DataFrame({'sku': ['a', 'b', 'f'], 'time': ['+', '-', '*']})
df3 = pd.DataFrame({'sku': ['a', 'b', 'd']})
df4 = pd.DataFrame({'sku': ['a', 'b', 'c', 'd']})

display(pd.merge(df1, df2))
display(pd.merge(df1, df3))
display(pd.merge(df1, df4))
def user_action_id_check():
    """Verify that every user_id appearing in each monthly action file
    also exists in the user file.

    The inner merge keeps all action rows iff no action references an
    unknown user, so `len(actions) == len(merge)` prints True on success.
    """
    df_user = pd.read_csv('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_User.csv')
    df_user = df_user.loc[:, 'user_id'].to_frame()
    df_month2 = pd.read_csv('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201602.csv')
    print('Is action of Feb. from User file?', len(df_month2) == len(pd.merge(df_user, df_month2)))
    df_month3 = pd.read_csv('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201603.csv')
    # BUG fix: the original printed "Feb." for March and April as well
    print('Is action of Mar. from User file?', len(df_month3) == len(pd.merge(df_user, df_month3)))
    df_month4 = pd.read_csv('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201604.csv')
    print('Is action of Apr. from User file?', len(df_month4) == len(pd.merge(df_user, df_month4)))
    del df_user, df_month2, df_month3, df_month4
    gc.collect()

user_action_id_check()
# Recorded output (all ids matched):
#   Is action of Feb. from User file? True
#   Is action of Mar. from User file? True
#   Is action of Apr. from User file? True
def deduplicate(filepath, filename, newpath):
    """Drop exact duplicate rows from the CSV at *filepath*.

    If duplicates are found, report how many and write the deduplicated
    frame to *newpath*; otherwise just report that the file is clean.
    *filename* is only a human-readable label used in the messages.
    (Message spacing fixed: the original printed "...for Xis:N" and
    "No duplicate records inX".)
    """
    df_file = pd.read_csv(filepath)
    before = df_file.shape[0]
    df_file.drop_duplicates(inplace=True)
    n_dup = before - df_file.shape[0]
    if n_dup != 0:
        print('No. of duplicate records for ' + filename + ' is: ' + str(n_dup))
        df_file.to_csv(newpath, index=None)
    else:
        print('No duplicate records in ' + filename)
    del df_file
    gc.collect()
- deduplicate('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201602.csv','Feb. action',
- 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201602_dedup.csv')
- deduplicate('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201603.csv','Mar. action',
- 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201603_dedup.csv')
- deduplicate('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201604.csv','Apr. action',
- 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201604_dedup.csv')
- deduplicate('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Comment.csv','Comment',
- 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Comment_dedup.csv')
- deduplicate('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Product.csv','Product',
- 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Product_dedup.csv')
- deduplicate('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_User.csv','User',
- 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_User_dedup.csv')
# Inspect what the duplicated rows look like: re-load the February
# action file and keep only the rows flagged as duplicates.
df_month3 = pd.read_csv('G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201602.csv')
IsDuplicated = df_month3.duplicated()
df_d = df_month3[IsDuplicated]
# Most duplicates turn out to come from browsing (type 1) or clicks (type 6)
display(df_d.groupby('type').count())

del df_month3, df_d
gc.collect()
- # File-name constants (absolute paths on the author's machine)
- ACTION_201602_FILE ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201602.csv'
- ACTION_201603_FILE ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201603.csv'
- ACTION_201604_FILE ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201604.csv'
-
- COMMENT_FILE = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Comment.csv'
- PRODUCT_FILE = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Product.csv'
- USER_FILE = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_User.csv'
- USER_TABLE_FILE = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/User_table.csv'
- # 导入相应的包
- import pandas as pd
- import numpy as np
- from collections import Counter
- # 功能函数:对每一个user 分组数据进行统计
def add_type_count(group):
    """Per-user helper: count how many records of each behaviour type
    (1 browse, 2 add-cart, 3 del-cart, 4 buy, 5 favor, 6 click) this
    user's group contains, and return one-column-per-type counts."""
    counts = Counter(group.type.astype(int))
    labels = ['browse_num', 'addcart_num', 'delcart_num',
              'buy_num', 'favor_num', 'click_num']
    # behaviour codes run 1..6 in the same order as `labels`
    for code, label in enumerate(labels, start=1):
        group[label] = counts[code]
    return group[['user_id'] + labels]
- # 对action数据进行统计
- # 根据自己调节chunk_size大小
def get_from_action_data(fname, chunk_size = 50000):
    """Stream one action CSV in chunks (only user_id/type are kept) and
    return one summary row per user with counts of each behaviour type."""
    reader = pd.read_csv(fname, header=0, iterator=True)
    pieces = []
    while True:
        try:
            pieces.append(reader.get_chunk(chunk_size)[['user_id', 'type']])
        except StopIteration:
            print('Iteration is stopped')
            break
    # Stitch the chunks back into one DataFrame
    df_ac = pd.concat(pieces, ignore_index=True)
    # Count behaviours per user, then keep a single row per user
    df_ac = df_ac.groupby(['user_id'], as_index=False).apply(add_type_count)
    df_ac = df_ac.drop_duplicates('user_id')
    return df_ac
- # Summarise February's actions per user, preview, then free memory.
- df_ac = get_from_action_data(fname = ACTION_201602_FILE,
- chunk_size = 50000)
- display(df_ac.head(10))
- del df_ac
- gc.collect()
- # Aggregate the per-user statistics of the three monthly action files
- def merge_action_data():
- df_ac = []
- df_ac.append(get_from_action_data(fname =ACTION_201602_FILE))
- df_ac.append(get_from_action_data(fname =ACTION_201603_FILE))
- df_ac.append(get_from_action_data(fname =ACTION_201604_FILE))
-
- df_ac = pd.concat(df_ac, ignore_index= True)
- # sum each user's counts across the three monthly tables
- df_ac = df_ac.groupby(['user_id'], as_index= False).sum()
- # derive conversion-rate features (buys relative to other behaviours)
- df_ac['buy_addcart_ratio'] = df_ac['buy_num'] / df_ac['addcart_num']
- df_ac['buy_browse_ratio'] = df_ac['buy_num'] / df_ac['browse_num']
- df_ac['buy_click_ratio'] = df_ac['buy_num'] /df_ac['click_num']
- df_ac['buy_favor_ratio'] = df_ac['buy_num'] /df_ac['favor_num']
-
- # clip conversion rates above 1 down to 1 (100%)
- df_ac.loc[df_ac['buy_addcart_ratio'] > 1., 'buy_addcart_ratio'] = 1
- df_ac.loc[df_ac['buy_browse_ratio'] > 1., 'buy_browse_ratio'] = 1
- df_ac.loc[df_ac['buy_click_ratio'] > 1., 'buy_click_ratio'] = 1
- df_ac.loc[df_ac['buy_favor_ratio'] > 1., 'buy_favor_ratio'] = 1
-
- return df_ac
- user_behavior =merge_action_data()
- user_behavior.head()
从JData_User 表中抽取需要的字段
def get_from_jdata_user():
    """Load the user table and keep only the basic profile columns.

    BUG fix: the original selected 'user_iddf_usrre', a corrupted column
    name; the intended columns are user_id / age / sex / user_lv_cd
    (age is required by the cleaning steps performed on the user table
    further down the page).
    """
    df_usr = pd.read_csv(USER_FILE, header=0)
    df_usr = df_usr[['user_id', 'age', 'sex', 'user_lv_cd']]
    return df_usr
-
- user_base =get_from_jdata_user()
- user_base.head()
- # Join profile and behaviour stats into one table, like a SQL left join
- user_table = pd.merge(user_base, user_behavior, on = ['user_id'], how = 'left')
- # persist as user_table.csv
- user_table.to_csv(USER_TABLE_FILE,index = False)
- display(user_table.head(10))
- del user_table, user_behavior, user_base
- gc.collect()
- import pandas as pd
- # Clean the aggregated user table: drop users with missing age, users
- # with no action records, users who never bought, and users with
- # implausibly low buy/browse and buy/click conversion ratios.
- df_user = pd.read_csv(USER_TABLE_FILE, header =0)
- # pd.options.display.float_format = '{:,0.3f}'.format # display format: three decimals
-
- pd.options.display.float_format = '{:,.3f}'.format
- df_user.shape # (105321, 14)
- df_user.describe()
df_user[df_user['age'].isnull()]
- delete_list = df_user[df_user['age'].isnull()].index
- df_user.drop(delete_list,axis= 0,inplace= True)
- df_user.shape # (105318, 14)
- # users with NaN in every behaviour column have no action records at all
- cond = (df_user['browse_num'].isnull())& (df_user['addcart_num'].isnull())& (df_user['delcart_num'].isnull()) &(df_user['buy_num'].isnull())& (df_user['favor_num'].isnull())&(df_user['click_num'].isnull())
- df_naction = df_user[cond]
- display(df_naction.shape) # (105177, 14)
- df_user.drop(df_naction.index, axis = 0, inplace = True)
- df_user.shape #(141, 14)
- # count users with no purchase record
- df_bzero = df_user[df_user['buy_num'] == 0]
- # number of records whose purchase count is 0
- print(len(df_bzero))
- # drop users with no purchase record
- df_user = df_user[df_user['buy_num'] !=0]
- df_user.describe()
- bindex = df_user[df_user['buy_browse_ratio'] < 0.0005].index
- print(len(bindex)) # 90
- df_user.drop(bindex ,axis= 0,inplace= True)
- bindex = df_user[df_user['buy_click_ratio'] < 0.0005].index
- print(len(bindex)) # 323
- df_user.drop(bindex ,axis= 0,inplace= True)
- df_user.to_csv('G:/01-project/07-机器学习/08-京东购买意向预测/data/User_table_cleaned.csv', index =False)
- df_user.describe()
- del df_user
- gc.collect() # garbage-collect to free memory
- # Imports and file-name constants for the visualisation section.
- %matplotlib inline
- # plotting packages
- import matplotlib
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
-
- # file-name constants (absolute paths on the author's machine)
- ACTION_201602_FILE ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201602.csv'
- ACTION_201603_FILE ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201603.csv'
- ACTION_201604_FILE ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201604.csv'
-
- COMMENT_FILE = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Comment.csv'
- PRODUCT_FILE = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Product.csv'
- USER_FILE = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_User.csv'
- USER_TABLE_FILE = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/User_table.csv'
- USER_TABLE_CLEANED = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/User_table_cleaned.csv'
# Extract purchase records from one monthly action file.
def get_from_action_data(fname,chunk_size = 50000):
    """Stream the CSV in chunks and return the user_id/sku_id/time
    columns of every purchase (type == 4) record."""
    reader = pd.read_csv(fname, header=0, iterator=True)
    pieces = []
    while True:
        try:
            pieces.append(reader.get_chunk(chunk_size)[['user_id', 'sku_id', 'type', 'time']])
        except StopIteration:
            print('Iteration is stopped')
            break
    df_ac = pd.concat(pieces, ignore_index=True)
    buys = df_ac[df_ac['type'] == 4]  # type 4 == purchase
    return buys[['user_id', 'sku_id', 'time']]
- # Weekly view: collect purchases from all three months, bucket them by
- # weekday, and plot users / items / records as grouped bars.
- df_ac = []
- df_ac.append(get_from_action_data(fname= ACTION_201602_FILE))
- df_ac.append(get_from_action_data(fname= ACTION_201603_FILE))
- df_ac.append(get_from_action_data(fname= ACTION_201604_FILE))
- df_ac =pd.concat(df_ac, ignore_index= True)
- display(df_ac.head(), df_ac.shape)
- # convert the time column to datetime
- df_ac['time'] = pd.to_datetime(df_ac['time'])
-
- # map each timestamp to its weekday (Mon=1 .. Sun=7)
- df_ac['time'] = df_ac['time'].apply(lambda x :x.weekday()+1)
- df_ac.head()
- # distinct buying users per weekday
- df_user = df_ac.groupby('time')['user_id'].nunique()
- df_user = df_user.to_frame().reset_index()
- df_user.columns = ['weekday', 'user_num']
- df_user
- # distinct items bought per weekday
- df_item = df_ac.groupby('time')['sku_id'].nunique()
- df_item =df_item.to_frame().reset_index()
- df_item.columns = ['weekday', 'item_num']
- df_item
- # purchase records per weekday
- df_ui = df_ac.groupby('time', as_index=False).size()
- # df_ui = df_ui.to_frame().reset_index()
- df_ui.columns = ['weekday', 'user_item_num']
- df_ui
- # bar width
- bar_width = 0.2
- # transparency
- opacity = 0.4
- plt.figure(figsize=(9,6))
- plt.bar(df_user['weekday'], df_user['user_num'], bar_width,
- alpha=opacity, color='c', label='user')
- plt.bar(df_item['weekday']+bar_width, df_item['item_num'],
- bar_width, alpha=opacity, color='g', label='item')
- plt.bar(df_ui['weekday']+bar_width*2, df_ui['user_item_num'],
- bar_width, alpha=opacity, color='m', label='user_item')
-
- plt.xlabel('weekday')
- plt.ylabel('number')
- plt.title('A Week Purchase Table')
- plt.xticks(df_user['weekday'] + bar_width * 3 / 2., (1,2,3,4,5,6,7))
-
- plt.tight_layout() # compact layout
- plt.legend(prop={'size':10})
- plt.savefig('./10-周购买情况数据可视化.png',dpi = 200)
- # February: per-day purchase counts and grouped bar chart.
- df_ac = get_from_action_data(fname = ACTION_201602_FILE)
- df_ac['time'] = pd.to_datetime(df_ac['time']).apply(lambda x:x.day)
- df_ac.head()
- # distinct buying users per day
- df_user = df_ac.groupby('time')['user_id'].nunique()
- df_user = df_user.to_frame().reset_index()
- df_user.columns = ['day', 'user_num']
- # distinct items bought per day
- df_item = df_ac.groupby('time')['sku_id'].nunique()
- df_item = df_item.to_frame().reset_index()
- df_item.columns = ['day', 'item_num']
- # purchase records per day
- df_ui = df_ac.groupby('time', as_index=False).size()
- df_ui.columns = ['day', 'user_item_num']
- df_ui
- # bar width
- bar_width = 0.2
- # transparency
- opacity = 0.4
- # number of days
- day_range = range(1,len(df_user['day']) +1)
- # figure size
- plt.figure(figsize= (14,10))
-
- plt.bar(df_user['day'], df_user['user_num'], bar_width,
- alpha = opacity, color = 'c', label = 'user')
- plt.bar(df_item['day'] + bar_width, df_item['item_num'],
- bar_width, alpha =opacity, color = 'g', label ='item')
- plt.bar(df_ui['day'] +bar_width *2, df_ui['user_item_num'],
- bar_width, alpha = opacity, color = 'm', label ='user_item')
- plt.xlabel('day')
- plt.ylabel('number')
- plt.title('February Purchase Table')
- plt.xticks(df_user['day'] + bar_width *3 /2, day_range)
- plt.tight_layout()
- plt.legend(prop = {'size' :9})
- plt.savefig('./11-2月购买情况可视化.png',dpi =200)
- # March: per-day purchase counts and grouped bar chart.
- df_ac = get_from_action_data(fname=ACTION_201603_FILE)
- # convert time to datetime, then map each timestamp to its day of month
- df_ac['time'] = pd.to_datetime(df_ac['time']).apply(lambda x: x.day)
- df_user = df_ac.groupby('time')['user_id'].nunique()
-
- df_user = df_user.to_frame().reset_index()
- df_user.columns = ['day', 'user_num']
- display(df_user)
-
- df_item = df_ac.groupby('time')['sku_id'].nunique()
- df_item = df_item.to_frame().reset_index()
- df_item.columns = ['day', 'item_num']
- display(df_item)
-
- df_ui = df_ac.groupby('time', as_index=False).size()
- df_ui.columns = ['day', 'user_item_num']
- display(df_ui)
- # bar width
- bar_width = 0.2
- # transparency
- opacity = 0.4
- # number of days
- day_range = range(1,len(df_user['day']) + 1, 1)
- # figure size
- plt.figure(figsize=(14,10))
-
- plt.bar(df_user['day'], df_user['user_num'], bar_width,
- alpha=opacity, color='c', label='user')
- plt.bar(df_item['day']+bar_width, df_item['item_num'],
- bar_width, alpha=opacity, color='g', label='item')
- plt.bar(df_ui['day']+bar_width*2, df_ui['user_item_num'],
- bar_width, alpha=opacity, color='m', label='user_item')
-
- plt.xlabel('day')
- plt.ylabel('number')
- plt.title('March Purchase Table')
- plt.xticks(df_user['day'] + bar_width * 3 / 2., day_range)
- plt.tight_layout()
- plt.legend(prop={'size':9})
- plt.savefig('./12-3月购买情况可视化.png',dpi = 200)
- # April: per-day purchase counts and grouped bar chart.
- df_ac = get_from_action_data(fname = ACTION_201604_FILE)
-
- # convert time to datetime, then map each timestamp to its day of month
- df_ac['time'] = pd.to_datetime(df_ac['time']).apply(lambda x :x.day)
-
- df_user = df_ac.groupby('time')['user_id'].nunique()
- df_user = df_user.to_frame().reset_index()
- df_user.columns = ['day', 'user_num']
-
- df_item = df_ac.groupby('time')['sku_id'].nunique()
- df_item = df_item.to_frame().reset_index()
- df_item.columns = ['day', 'item_num']
-
- df_ui = df_ac.groupby('time', as_index= False).size()
- df_ui.columns = ['day', 'user_item_num']
- bar_width = 0.2
- opacity = 0.4
-
- day_range = range(1, len(df_user['day']) +1, 1)
- plt.figure(figsize= (14, 10))
-
- plt.bar(df_user['day'], df_user['user_num'], bar_width,
- alpha = opacity, color = 'c', label = 'user')
- plt.bar(df_item['day'] + bar_width, df_item['item_num'],
- bar_width ,alpha = opacity, color = 'g', label = 'item')
- plt.bar(df_ui['day'] + bar_width *2, df_ui['user_item_num'],
- bar_width, alpha = opacity, color = 'm', label = 'user_item')
-
- plt.xlabel('day')
- plt.ylabel('number')
- plt.title('April Purchase Table')
- plt.xticks(df_user['day'] + bar_width * 3 /2, day_range)
-
- plt.tight_layout()
- plt.legend(prop = {'size':9})
- plt.savefig('./14-4月购买情况可视化.png', dpi = 200)
# Extract the category/brand info of purchase records from one action file.
def get_from_action_data(fname,chunk_size = 50000):
    """Stream the CSV in chunks and return cate/brand/type/time of every
    purchase (type == 4) record."""
    reader = pd.read_csv(fname, header=0, iterator=True)
    pieces = []
    while True:
        try:
            pieces.append(reader.get_chunk(chunk_size)[['cate', 'brand', 'type','time']])
        except StopIteration:
            print('Iteration is stopped')
            break
    df_ac = pd.concat(pieces, ignore_index=True)
    buys = df_ac[df_ac['type'] == 4]  # type 4 == purchase
    return buys[['cate', 'brand', 'type','time']]
- # Collect purchases from all three months, map time to weekday, and
- # look at how many product categories occur.
- df_ac = []
- df_ac.append(get_from_action_data(fname= ACTION_201602_FILE))
- df_ac.append(get_from_action_data(fname= ACTION_201603_FILE))
- df_ac.append(get_from_action_data(fname= ACTION_201604_FILE))
- df_ac = pd.concat(df_ac, ignore_index= True)
-
- # convert the time column to datetime
- df_ac['time'] = pd.to_datetime(df_ac['time'])
- # map each timestamp to its weekday (Mon=1 .. Sun=7)
- df_ac['time'] = df_ac['time'].apply(lambda x: x.weekday() + 1)
- # how many distinct product categories are present
- df_ac.groupby(df_ac['cate']).count()
查看系统字体
# List every font family name Matplotlib can see on this machine
# (used to pick a Chinese-capable font for the plots below).
from matplotlib.font_manager import FontManager
fm = FontManager()
[font.name for font in fm.ttflist]
- # Weekly sales per product category, plotted as grouped bars
- # (STKaiti is a Chinese-capable font for the title).
- plt.rcParams['font.family'] = 'STKaiti'
- plt.rcParams['font.size'] = 25
-
- df_product = df_ac['brand'].groupby([df_ac['time'], df_ac['cate']]).count()
- df_product = df_product.unstack()
-
- df_product.plot(kind = 'bar', figsize = (14, 10))
- plt.title(label= '不同商品周销量表',pad = 20)
- plt.savefig('./16 -不同商品周销量表.png',dpi = 200)
- df_product = df_ac['brand'].groupby([df_ac['time'], df_ac['cate']]).count()
- df_product.head(10)
- # Re-extract each month's purchases, map time to day of month, then
- # count daily purchases of category 8 per month.
- df_ac2 = get_from_action_data(fname= ACTION_201602_FILE)
- # convert time to day of month
- df_ac2['time'] = pd.to_datetime(df_ac2['time']).apply(lambda x : x.day)
-
- df_ac3 = get_from_action_data(fname= ACTION_201603_FILE)
- # convert time to day of month
- df_ac3['time'] = pd.to_datetime(df_ac3['time']).apply(lambda x : x.day)
-
- df_ac4 = get_from_action_data(fname= ACTION_201604_FILE)
- # convert time to day of month
- df_ac4['time'] = pd.to_datetime(df_ac4['time']).apply(lambda x : x.day)
- dc_cate2 = df_ac2[df_ac2['cate'] == 8]
- dc_cate2 = dc_cate2['brand'].groupby(dc_cate2['time']).count()
- display(dc_cate2.head())
- dc_cate2 = dc_cate2.to_frame().reset_index() # reset the index
- display(dc_cate2.head())
- dc_cate2.columns = ['day', 'product_num']
-
- dc_cate3 =df_ac3[df_ac3['cate'] == 8]
- dc_cate3 = dc_cate3['brand'].groupby(dc_cate3['time']).count()
- dc_cate3 = dc_cate3.to_frame().reset_index()
- dc_cate3.columns = ['day', 'product_num']
-
- dc_cate4 =df_ac4[df_ac4['cate'] == 8]
- dc_cate4 = dc_cate4['brand'].groupby(dc_cate4['time']).count()
- dc_cate4 = dc_cate4.to_frame().reset_index()
- dc_cate4.columns = ['day', 'product_num']
- # Plot daily category-8 sales, one bar group per day, one colour per month.
- # bar width
- bar_width = 0.2
- # transparency
- opacity = 0.4
- # number of days
- day_range = range(1, len(dc_cate3['day']) +1 ,1)
- plt.rcParams['font.family'] = 'STKaiti'
- plt.rcParams['font.size'] = 25
- # figure size
- plt.figure(figsize= (14,10))
-
- plt.bar(dc_cate2['day'], dc_cate2['product_num'], bar_width,
- alpha = opacity, color = 'c', label = 'February')
- plt.bar(dc_cate3['day'] + bar_width, dc_cate3['product_num'],
- bar_width, alpha = opacity, color = 'g', label = 'March')
- plt.bar(dc_cate4['day']+bar_width*2, dc_cate4['product_num'],
- bar_width, alpha=opacity, color='m', label='April')
- # plt.bar(dc_cate4['day'] + bar_width * 2, dc_cate4['product_num'],
- # bar_width, alpaha = opacity, color = 'm', label = 'April')
-
- plt.xlabel('day')
- plt.ylabel('number')
- plt.title('商品8 销量统计表', pad = 20)
- plt.xticks(dc_cate3['day'] + bar_width * 3 /2, day_range)
- plt.tight_layout()
- plt.legend(prop={'size':9})
- plt.savefig('./17-商品8每月按天统计销量可视化.png',dpi = 200)
def spec_ui_action_data(fname, user_id, item_id, chunk_size = 100000):
    """Stream one action CSV in chunks and return every record that
    belongs to the given (user_id, item_id) pair."""
    reader = pd.read_csv(fname, header=0, iterator=True)
    pieces = []
    while True:
        try:
            pieces.append(reader.get_chunk(chunk_size)[['user_id', 'sku_id', 'type', 'time']])
        except StopIteration:
            print('Iteration is stopped')
            break
    df_ac = pd.concat(pieces, ignore_index=True)
    mask = (df_ac['user_id'] == user_id) & (df_ac['sku_id'] == item_id)
    return df_ac[mask]
- # Print the full, time-ordered interaction history of one hard-coded
- # (user, item) pair across the three monthly files.
- def explore_user_item_via_time():
- user_id = 266079
- item_id = 138778
- df_ac = []
- df_ac.append(spec_ui_action_data(ACTION_201602_FILE,user_id, item_id))
- df_ac.append(spec_ui_action_data(ACTION_201603_FILE,user_id, item_id))
- df_ac.append(spec_ui_action_data(ACTION_201604_FILE,user_id, item_id))
-
- df_ac = pd.concat(df_ac, ignore_index = False)
- print(df_ac.sort_values(by='time'))
explore_user_item_via_time()
导包
- from datetime import datetime
- from datetime import timedelta
- import pandas as pd
- import numpy as np
- import gc
变量声明
- # Path constants for the feature-engineering section.
- action_1_path ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201602.csv'
- action_2_path ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201603.csv'
- action_3_path ='G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Action_201604.csv'
-
- comment_path = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Comment.csv'
- product_path = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_Product.csv'
- user_path = 'G:/01-project/07-机器学习/08-京东购买意向预测/data/JData_User.csv'
定义函数提取数据
# The numeric id/code columns shared by all monthly action files.
ACTION_COLS = ['user_id', 'sku_id', 'model_id', 'type', 'cate', 'brand']

def _load_action(path):
    """Read one monthly action CSV and down-cast the numeric columns to
    float32 to cut memory use (float rather than int because model_id
    may contain NaN)."""
    action = pd.read_csv(path)
    action[ACTION_COLS] = action[ACTION_COLS].astype('float32')
    return action

# The three public loaders are kept for backward compatibility; the
# original duplicated the same body three times.
def get_actions_1():
    return _load_action(action_1_path)

def get_actions_2():
    return _load_action(action_2_path)

def get_actions_3():
    return _load_action(action_3_path)

# Read and concatenate all three monthly behaviour files
def get_all_action():
    actions = pd.concat([get_actions_1(), get_actions_2(), get_actions_3()])  # type: pd.DataFrame
    return actions
-
- # 获取某个时间段的行为记录
def get_actions(start_date, end_date, all_actions):
    """Return a copy of the rows whose time falls in [start_date, end_date).

    :param start_date: window start, 'YYYY-MM-DD' string (inclusive)
    :param end_date: window end, 'YYYY-MM-DD' string (exclusive)
    :return: actions: the selected rows

    The comparison relies on the lexicographic ordering of the
    zero-padded timestamp strings.
    """
    in_window = (all_actions.time >= start_date) & (all_actions.time < end_date)
    return all_actions[in_window].copy()
- # LabelEncoder demo, plus a quick check that the raw age column can be
- # label-encoded into integer codes.
- from sklearn import preprocessing
- le = preprocessing.LabelEncoder()
- le.fit(["paris", "paris", "tokyo", "amsterdam"])
- le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
- # array([1, 1, 2, 0], dtype=int64)
- user = pd.read_csv(user_path)
- display(user.head())
- le = preprocessing.LabelEncoder()
- age_df = le.fit_transform(user['age']) # numpy array of integer age codes
- display(age_df[:5])
- del user,age_df
- gc.collect()
- from sklearn import preprocessing
-
def get_basic_user_feat():
    """Build basic one-hot user features from age, sex and user level.

    Rows containing any NaN are dropped, age is label-encoded and then
    age / sex / user_lv_cd are one-hot encoded and joined to user_id.
    """
    user = pd.read_csv(user_path)
    user.dropna(axis=0, how='any', inplace=True)
    # BUG fix: after dropna the index has holes, but the LabelEncoder
    # output is a plain numpy array, so its get_dummies frame gets a
    # fresh RangeIndex. pd.concat(axis=1) aligns on index, which would
    # mis-align the age dummies against the other columns — reset first.
    user.reset_index(drop=True, inplace=True)
    user['sex'] = user['sex'].astype(int)
    user['age'] = user['age'].astype(int)
    le = preprocessing.LabelEncoder()
    age_df = le.fit_transform(user['age'])

    age_df = pd.get_dummies(age_df, prefix='age')  # one-hot encoding
    sex_df = pd.get_dummies(user['sex'], prefix='sex')
    user_lv_df = pd.get_dummies(user['user_lv_cd'], prefix='user_lv_cd')
    user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
    return user
preprocessing.LabelEncoder():标准化标签,将各标签值统一转换成 0 到 类别数-1(即 range(n_classes))范围内的整数.
- # LabelEncoder recap, then build and preview the basic user features.
- from sklearn import preprocessing
- le =preprocessing.LabelEncoder()
- le.fit(["paris", "paris", "tokyo", "amsterdam"])
- print('标签个数:%s'% le.classes_) # ['amsterdam' 'paris' 'tokyo']
- print('标签值标准化:%s' % le.transform(["tokyo", "tokyo", "paris"])) # [2 2 1]
- print('标准化标签值反转:%s'%le.inverse_transform([2,2,1])) #['tokyo' 'tokyo' 'paris']
- user = get_basic_user_feat()
- display(user.head())
- del user
- gc.collect()
根据商品文件获取基本的特征,针对属性a1,a2,a3进行独热编码,商品类别和品牌直接作为特征 .
def get_basic_product_feat():
    """One-hot encode the product attributes a1/a2/a3 and keep sku_id,
    cate and brand unchanged."""
    product = pd.read_csv(product_path)
    dummies = [pd.get_dummies(product[col], prefix=col)
               for col in ('a1', 'a2', 'a3')]
    product = pd.concat([product[['sku_id', 'cate', 'brand']]] + dummies,
                        axis=1)
    return product
def get_comments_product_feat(end_date):
    """Build per-sku comment features from comments dated <= end_date."""
    comments = pd.read_csv(comment_path)
    comments = comments[comments.dt <= end_date]  # comments up to end_date
    df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
    # Guarantee all five comment_num levels exist even if a level is
    # absent in this time window (this happened on the test set)
    for level in range(0, 5):
        col = 'comment_num_' + str(level)
        if col not in df.columns:
            df[col] = 0
    df = df[['comment_num_0', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]

    comments = pd.concat([comments, df], axis=1)  # type: pd.DataFrame

    keep = ['sku_id', 'has_bad_comment', 'bad_comment_rate', 'comment_num_0', 'comment_num_1',
            'comment_num_2', 'comment_num_3', 'comment_num_4']
    return comments[keep]
评论数据转换
- # Comment-feature demo: take a 3-day window starting 2016-02-01.
- start_date = '2016-02-01'
- end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
- end_date = end_date.strftime('%Y-%m-%d')
-
- display(start_date)
- comments = get_comments_product_feat(end_date)
- display(comments.head(),comments.shape)
- del comments
- gc.collect()
3.6.1.1 函数定义
- # Build interaction features for the window [start_date, end_date):
- # one-hot the behaviour type, count per (user, cate, sku) and per
- # (user, cate), subtract the two to get activity on OTHER items of the
- # same category, and add count-minus-mean features. `day` is the window
- # length used both in the column prefix and as the mean divisor.
- def get_action_feat(start_date, end_date, all_actions, day):
- actions = get_actions(start_date, end_date, all_actions)
- actions = actions[['user_id', 'sku_id', 'cate','type']]
- # one-hot encode the behaviour type
- prefix = 'action_before_%s' % day
- df = pd.get_dummies(actions['type'], prefix=prefix)
- actions = pd.concat([actions, df], axis=1)
-
- # counts per user-category-item triple
- actions = actions.groupby(['user_id', 'cate','sku_id'], as_index=False).sum()
- # counts per user-category pair
- user_cate = actions.groupby(['user_id','cate'], as_index=False).sum()
- del user_cate['sku_id']
- del user_cate['type']
- # merge the two aggregation levels
- actions = pd.merge(actions, user_cate, how='left', on=['user_id','cate'])
-
- # the merge suffixes the shared count columns with _x (per item) and
- # _y (per category); subtracting gives the user's activity on OTHER
- # items of the same category
- # windows used elsewhere: 3, 5, 7, 10, 15, 21, 30 days
- actions[prefix+'_1.0_y'] = actions[prefix+'_1.0_y'] - actions[prefix+'_1.0_x']
- actions[prefix+'_2.0_y'] = actions[prefix+'_2.0_y'] - actions[prefix+'_2.0_x']
- actions[prefix+'_3.0_y'] = actions[prefix+'_3.0_y'] - actions[prefix+'_3.0_x']
- actions[prefix+'_4.0_y'] = actions[prefix+'_4.0_y'] - actions[prefix+'_4.0_x']
- actions[prefix+'_5.0_y'] = actions[prefix+'_5.0_y'] - actions[prefix+'_5.0_x']
- actions[prefix+'_6.0_y'] = actions[prefix+'_6.0_y'] - actions[prefix+'_6.0_x']
-
-
- # per-item count minus its per-day mean over the window
- actions[prefix+'minus_mean_1'] = actions[prefix+'_1.0_x'] - (actions[prefix+'_1.0_x']/day)
- actions[prefix+'minus_mean_2'] = actions[prefix+'_2.0_x'] - (actions[prefix+'_2.0_x']/day)
- actions[prefix+'minus_mean_3'] = actions[prefix+'_3.0_x'] - (actions[prefix+'_3.0_x']/day)
- actions[prefix+'minus_mean_4'] = actions[prefix+'_4.0_x'] - (actions[prefix+'_4.0_x']/day)
- actions[prefix+'minus_mean_5'] = actions[prefix+'_5.0_x'] - (actions[prefix+'_5.0_x']/day)
- actions[prefix+'minus_mean_6'] = actions[prefix+'_6.0_x'] - (actions[prefix+'_6.0_x']/day)
- del actions['type']
- return actions
3.6.1.2 代码解读
加载一定时间段内所有数据
- # Load all actions and slice a 3-day window for inspection.
- all_actions = get_all_action()
- start_date = '2016-02-01'
- end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
- end_date = end_date.strftime('%Y-%m-%d')
- # slice the window
- actions = get_actions(start_date, end_date, all_actions)
- display(actions.head(),actions.shape)
- del all_actions
- gc.collect()
分组统计用户-类别-商品, 不同用户对不同类别下商品的行为计数
- # Walk through get_action_feat step by step: select the columns,
- # one-hot the type, and aggregate per user-category-item.
- actions = actions[['user_id', 'sku_id', 'cate','type']]
- # one-hot encode the behaviour type
- df = pd.get_dummies(actions['type'], prefix='action_before_%s' %3)
- display(df.head())
-
- # merge the dummies back
- actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame
- display(actions.head(),actions.shape)
- del df
- gc.collect()
-
- # counts per user-category-item triple
- actions = actions.groupby(['user_id', 'cate','sku_id'], as_index=False).sum()
- display(actions.head(), actions.shape)
简单代码演示groupby, data.groupby() # 分组聚合
# groupby demo: as_index=True keeps 'books' as the index of the
# aggregated frame; as_index=False keeps it as a regular column.
import pandas as pd

df = pd.DataFrame(data={'books': ['bk1', 'bk1', 'bk1', 'bk2', 'bk2', 'bk3'],
                        'price': [12, 12, 12, 15, 15, 17],
                        'num': [2, 1, 1, 4, 2, 2]})
display(df)
display(df.groupby('books', as_index=True).sum())
display(df.groupby('books', as_index=False).sum())
分组统计用户-类别,不同用户对不同商品类别的行为计数
- # Counts per user-category pair, merged back onto the per-item rows.
- user_cate = actions.groupby(['user_id','cate'], as_index=False).sum()
- del user_cate['sku_id']
- del user_cate['type']
- display(user_cate.head(),user_cate.shape)
- actions = pd.merge(actions, user_cate, how='left', on=['user_id','cate'])
- del user_cate
- gc.collect()
- display(actions.head(),actions.shape)
用户对同类别下其他商品的行为计数
- # Activity of the user on OTHER items of the same category
- # (_y holds the per-category count, _x the per-item count).
- prefix = 'action_before_%s' % 3
- actions[prefix+'_1_y'] = actions[prefix+'_1.0_y'] - actions[prefix+'_1.0_x']
- display(actions.head(),actions.shape)
- del actions
- gc.collect()
- # Accumulated per-user features over the `day` days ending at end_date:
- # per-type counts, log-space conversion ratios (buy=4 vs each other
- # type) and per-day means.
- def get_accumulate_user_feat(end_date, all_actions, day):
- start_date = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=day)
- start_date = start_date.strftime('%Y-%m-%d')
- prefix = 'user_action_%s' % day
-
- actions = get_actions(start_date, end_date, all_actions)
- df = pd.get_dummies(actions['type'], prefix=prefix) # one-hot encoding
-
- actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
- actions = pd.concat([actions[['user_id', 'date']], df], axis=1)
- del df
- gc.collect()
- # per-user sums, then conversion ratios and per-day means
- actions = actions.groupby(['user_id'], as_index=False).sum()
- actions[prefix + '_1_ratio'] = np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_1.0'])
- actions[prefix + '_2_ratio'] = np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_2.0'])
- actions[prefix + '_3_ratio'] = np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_3.0'])
- actions[prefix + '_5_ratio'] = np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_5.0'])
- actions[prefix + '_6_ratio'] = np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_6.0'])
- # per-day means
- actions[prefix + '_1_mean'] = actions[prefix + '_1.0'] / day
- actions[prefix + '_2_mean'] = actions[prefix + '_2.0'] / day
- actions[prefix + '_3_mean'] = actions[prefix + '_3.0'] / day
- actions[prefix + '_4_mean'] = actions[prefix + '_4.0'] / day
- actions[prefix + '_5_mean'] = actions[prefix + '_5.0'] / day
- actions[prefix + '_6_mean'] = actions[prefix + '_6.0'] / day
- return actions
代码测试
np.log2(16) - np.log2(32) # -1.0
3.6.2.1.2 代码解读
加载一定时间段内所有数据
- # Step-by-step walkthrough of get_accumulate_user_feat on a 3-day window.
- prefix = 'user_action_%s' % 3
- all_actions = get_all_action()
- start_date = '2016-02-01'
- end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
- end_date = end_date.strftime('%Y-%m-%d')
- # slice the window
- actions = get_actions(start_date, end_date, all_actions)
- display(actions.head(),actions.shape)
- del all_actions
- gc.collect()
用户行为统计计数
- # One-hot the behaviour types and sum them per user.
- df = pd.get_dummies(actions['type'], prefix=prefix)
- display(df.head(),df.shape)
- actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
- actions = pd.concat([actions[['user_id', 'date']], df], axis=1)
- actions = actions.groupby(['user_id'],as_index=False).sum()
- display(actions.head(),actions.shape)
不同行为转购率和均值
- # Browse-to-buy log ratio and per-day browse mean for the 3-day window.
- actions[prefix + '_1_ratio'] = np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_1.0'])
- actions[prefix + '_1_mean'] = actions[prefix + '_1.0'] / 3
- actions.head(20)
3.6.2.2 用户近期行为特征
在上面针对用户进行累积特征提取的基础上,分别提取用户近一个月、近三天的特征,然后提取一个月内用户除去最近三天的行为占据一个月的行为的比重
def get_recent_user_feat(end_date, all_actions):
    """Join the 3-day and 30-day accumulated user features and add, per
    action type, a log-ratio of the month's activity that happened
    OUTSIDE the most recent 3 days versus the whole month."""
    short_feat = get_accumulate_user_feat(end_date, all_actions, 3)
    long_feat = get_accumulate_user_feat(end_date, all_actions, 30)
    actions = pd.merge(short_feat, long_feat, how='left', on='user_id')
    del short_feat
    del long_feat
    gc.collect()

    # recent_actionN = log(1 + month_count - last3_count) - log(1 + month_count)
    for t in range(1, 7):
        month = actions['user_action_30_%d.0' % t]
        last3 = actions['user_action_3_%d.0' % t]
        actions['recent_action%d' % t] = np.log(1 + month - last3) - np.log(1 + month)

    return actions
3.6.2.3 用户对大类别商品交互行为特征工程
- #增加了用户对不同类别的交互特征
- def get_user_cate_feature(start_date, end_date, all_actions):
- actions = get_actions(start_date, end_date, all_actions)
- actions = actions[['user_id', 'cate', 'type']]
- df = pd.get_dummies(actions['type'], prefix='type')
- actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)
-
- actions = actions.groupby(['user_id', 'cate']).sum()
-
- actions = actions.unstack()
- actions.columns = actions.columns.swaplevel(0, 1)
- actions.columns = actions.columns.droplevel()
-
- actions.columns = [
- 'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',
- 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',
- 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',
- 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',
- 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',
- 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',
- 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',
- 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',
- 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',
- 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',
- 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',
- 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6']
-
- actions = actions.fillna(0)
-
- actions['cate_action_sum'] = actions.sum(axis=1)
-
- # 用户对各个类别操作行为统计占对所有类别操作行为统计的比重
- actions['cate8_percentage'] = (
- actions['cate_8_type1'] + actions['cate_8_type2'] +
- actions['cate_8_type3'] + actions['cate_8_type4'] +
- actions['cate_8_type5'] + actions['cate_8_type6']) / actions['cate_action_sum']
-
- actions['cate4_percentage'] = (
- actions['cate_4_type1'] + actions['cate_4_type2'] +
- actions['cate_4_type3'] + actions['cate_4_type4'] +
- actions['cate_4_type5'] + actions['cate_4_type6']) / actions['cate_action_sum']
- actions['cate5_percentage'] = (
- actions['cate_5_type1'] + actions['cate_5_type2'] +
- actions['cate_5_type3'] + actions['cate_5_type4'] +
- actions['cate_5_type5'] + actions['cate_5_type6']) / actions['cate_action_sum']
- actions['cate6_percentage'] = (
- actions['cate_6_type1'] + actions['cate_6_type2'] +
- actions['cate_6_type3'] + actions['cate_6_type4'] +
- actions['cate_6_type5'] + actions['cate_6_type6']) / actions['cate_action_sum']
- actions['cate7_percentage'] = (
- actions['cate_7_type1'] + actions['cate_7_type2'] +
- actions['cate_7_type3'] + actions['cate_7_type4'] +
- actions['cate_7_type5'] + actions['cate_7_type6']) / actions['cate_action_sum']
- actions['cate9_percentage'] = (
- actions['cate_9_type1'] + actions['cate_9_type2'] +
- actions['cate_9_type3'] + actions['cate_9_type4'] +
- actions['cate_9_type5'] + actions['cate_9_type6']) / actions['cate_action_sum']
- actions['cate10_percentage'] = (
- actions['cate_10_type1'] + actions['cate_10_type2'] +
- actions['cate_10_type3'] + actions['cate_10_type4'] +
- actions['cate_10_type5'] + actions['cate_10_type6']) / actions['cate_action_sum']
- actions['cate11_percentage'] = (
- actions['cate_11_type1'] + actions['cate_11_type2'] +
- actions['cate_11_type3'] + actions['cate_11_type4'] +
- actions['cate_11_type5'] + actions['cate_11_type6']) / actions['cate_action_sum']
-
- actions['cate8_type1_percentage'] = np.log(
- 1 + actions['cate_8_type1']) - np.log(
- 1 + actions['cate_8_type1'] + actions['cate_4_type1'] +
- actions['cate_5_type1'] + actions['cate_6_type1'] +
- actions['cate_7_type1'] + actions['cate_9_type1'] +
- actions['cate_10_type1'] + actions['cate_11_type1'])
-
- actions['cate8_type2_percentage'] = np.log(
- 1 + actions['cate_8_type2']) - np.log(
- 1 + actions['cate_8_type2'] + actions['cate_4_type2'] +
- actions['cate_5_type2'] + actions['cate_6_type2'] +
- actions['cate_7_type2'] + actions['cate_9_type2'] +
- actions['cate_10_type2'] + actions['cate_11_type2'])
- actions['cate8_type3_percentage'] = np.log(
- 1 + actions['cate_8_type3']) - np.log(
- 1 + actions['cate_8_type3'] + actions['cate_4_type3'] +
- actions['cate_5_type3'] + actions['cate_6_type3'] +
- actions['cate_7_type3'] + actions['cate_9_type3'] +
- actions['cate_10_type3'] + actions['cate_11_type3'])
- actions['cate8_type4_percentage'] = np.log(
- 1 + actions['cate_8_type4']) - np.log(
- 1 + actions['cate_8_type4'] + actions['cate_4_type4'] +
- actions['cate_5_type4'] + actions['cate_6_type4'] +
- actions['cate_7_type4'] + actions['cate_9_type4'] +
- actions['cate_10_type4'] + actions['cate_11_type4'])
- actions['cate8_type5_percentage'] = np.log(
- 1 + actions['cate_8_type5']) - np.log(
- 1 + actions['cate_8_type5'] + actions['cate_4_type5'] +
- actions['cate_5_type5'] + actions['cate_6_type5'] +
- actions['cate_7_type5'] + actions['cate_9_type5'] +
- actions['cate_10_type5'] + actions['cate_11_type5'])
- actions['cate8_type6_percentage'] = np.log(
- 1 + actions['cate_8_type6']) - np.log(
- 1 + actions['cate_8_type6'] + actions['cate_4_type6'] +
- actions['cate_5_type6'] + actions['cate_6_type6'] +
- actions['cate_7_type6'] + actions['cate_9_type6'] +
- actions['cate_10_type6'] + actions['cate_11_type6'])
-
- actions['user_id'] = actions.index
- actions = actions[[
- 'user_id', 'cate8_percentage', 'cate4_percentage', 'cate5_percentage',
- 'cate6_percentage', 'cate7_percentage', 'cate9_percentage',
- 'cate10_percentage', 'cate11_percentage', 'cate8_type1_percentage',
- 'cate8_type2_percentage', 'cate8_type3_percentage',
- 'cate8_type4_percentage', 'cate8_type5_percentage',
- 'cate8_type6_percentage']]
- return actions
3.6.2.3.2 代码解读
加载一定时间段内所有数据
- prefix = 'user_action_%s' % 3
- all_actions = get_all_action()
- start_date = '2016-02-01'
- end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
- end_date = end_date.strftime('%Y-%m-%d')
- # 获取一定时间段内数据
- actions = get_actions(start_date, end_date, all_actions)
- actions = actions[['user_id', 'cate', 'type']]
- display(actions.head(),actions.shape)
- del all_actions
- gc.collect()
用户类别分组聚合
- df = pd.get_dummies(actions['type'], prefix='type')
- actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)
- actions = actions.groupby(['user_id', 'cate']).sum()
- actions.head(20)
行索引变列索引
- actions = actions.unstack()
- actions.head()
交换列索引层级
- actions.columns = actions.columns.swaplevel(0, 1)
- actions.head()
删除第一层列索引
- actions.columns = actions.columns.droplevel()
- actions.head()
列索引重新赋值
- actions.columns = [
- 'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',
- 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',
- 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',
- 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',
- 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',
- 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',
- 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',
- 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',
- 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',
- 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',
- 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',
- 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6']
- actions.head()
空数据填充并求和
- actions = actions.fillna(0)
- display(actions.head())
- actions['cate_action_sum'] = actions.sum(axis=1)
- actions.head()
用户对类别8操作行为统计占对所有类别操作行为统计的比重
- actions['cate8_percentage'] = (
- actions['cate_8_type1'] + actions['cate_8_type2'] +
- actions['cate_8_type3'] + actions['cate_8_type4'] +
- actions['cate_8_type5'] + actions['cate_8_type6']) / actions['cate_action_sum']
- actions.head()
类别8-交互1占总交互1比例
- actions['cate8_type1_percentage'] = np.log(1 + actions['cate_8_type1'])- np.log(
- 1 + actions['cate_8_type1'] + actions['cate_4_type1'] +
- actions['cate_5_type1'] + actions['cate_6_type1'] +
- actions['cate_7_type1'] + actions['cate_9_type1'] +
- actions['cate_10_type1'] + actions['cate_11_type1'])
- actions.head()
- actions['user_id'] = actions.index
- actions.head()
3.6.3 商品-行为
def get_accumulate_product_feat(start_date, end_date, all_actions):
    """Per-SKU action counts over [start_date, end_date), with
    purchase-conversion log ratios and per-day means for each action type."""
    actions = get_actions(start_date, end_date, all_actions)
    dummies = pd.get_dummies(actions['type'], prefix='product_action')
    actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
    actions = pd.concat([actions[['sku_id', 'date']], dummies], axis=1)

    actions = actions.groupby(['sku_id'], as_index=False).sum()
    # length of the window in days
    days_interval = (datetime.strptime(end_date, '%Y-%m-%d')
                     - datetime.strptime(start_date, '%Y-%m-%d')).days

    # conversion of each action type into purchases (type 4), in log space
    purchases = actions['product_action_4.0']
    for t in (1, 2, 3, 5, 6):
        actions['product_action_%d_ratio' % t] = (
            np.log(1 + purchases) - np.log(1 + actions['product_action_%d.0' % t]))
    # daily average count of each action type
    for t in range(1, 7):
        actions['product_action_%d_mean' % t] = (
            actions['product_action_%d.0' % t] / days_interval)
    return actions
3.6.3.2 代码解读
加载一定时间段内所有数据
- prefix = 'user_action_%s' % 3
- all_actions = get_all_action()
- start_date = '2016-02-01'
- end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
- end_date = end_date.strftime('%Y-%m-%d')
- # 获取一定时间段内数据
- actions = get_actions(start_date, end_date, all_actions)
- display(actions.head(),actions.shape)
- del all_actions
- gc.collect()
商品分组聚合
- df = pd.get_dummies(actions['type'], prefix='product_action')
- actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
- actions = pd.concat([actions[['sku_id', 'date']], df], axis=1)
- actions = actions.groupby(['sku_id'], as_index=False).sum()
- actions.head()
actions.head(50)
商品不同行为的转购率和均值计算
- days_interal = (datetime.strptime(end_date, '%Y-%m-%d') -
- datetime.strptime(start_date, '%Y-%m-%d')).days
- print('时间间隔',days_interal)
- actions['product_action_1_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_1.0'])
- actions['product_action_1_mean'] = actions['product_action_1.0'] / days_interal
- actions.head()
分时间段下各个商品类别的行为特征(转化率与均值)
def get_accumulate_cate_feat(start_date, end_date, all_actions):
    """Per-category action counts over [start_date, end_date), with
    purchase-conversion log ratios and per-day means for each action type."""
    actions = get_actions(start_date, end_date, all_actions)
    actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
    dummies = pd.get_dummies(actions['type'], prefix='cate_action')
    actions = pd.concat([actions[['cate', 'date']], dummies], axis=1)

    # one row per category
    actions = actions.groupby(['cate'], as_index=False).sum()

    days_interval = (datetime.strptime(end_date, '%Y-%m-%d')
                     - datetime.strptime(start_date, '%Y-%m-%d')).days

    # conversion of each action type into purchases (type 4), in log space
    purchases = actions['cate_action_4.0']
    for t in (1, 2, 3, 5, 6):
        actions['cate_action_%d_ratio' % t] = (
            np.log(1 + purchases) - np.log(1 + actions['cate_action_%d.0' % t]))
    # daily average count of each action type
    for t in range(1, 7):
        actions['cate_action_%d_mean' % t] = (
            actions['cate_action_%d.0' % t] / days_interval)
    return actions
购买行为标记
def get_labels(start_date, end_date, all_actions):
    """Positive labels: (user_id, sku_id) pairs with at least one purchase
    (type == 4) of a category-8 item inside [start_date, end_date)."""
    actions = get_actions(start_date, end_date, all_actions)

    # restrict to purchases of category 8 (the prediction target)
    purchases = actions[(actions['type'] == 4) & (actions['cate'] == 8)]
    purchases = purchases.groupby(['user_id', 'sku_id'], as_index=False).sum()
    purchases['label'] = 1
    return purchases[['user_id', 'sku_id', 'label']]
查看用户数据结构
- # 查看全部数据
- all_actions = get_all_action()
- print ("get all actions!")
- display(all_actions.head(),all_actions.shape)
- del all_actions
- gc.collect()
- # 用户特征
- user = get_basic_user_feat()
- print ('get_basic_user_feat finsihed')
- display(user.head(),user.shape)
- del user
- gc.collect()
- # 商品基本特征
- product = get_basic_product_feat()
- print ('get_basic_product_feat finsihed')
- display(product.head(),product.shape)
- del product
- gc.collect()
- # 用户近期行为特征
- start_date = '2016-02-01'
- end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
- end_date = end_date.strftime('%Y-%m-%d') # 转为字符串
- all_actions = get_all_action()
- user_acc = get_recent_user_feat(end_date, all_actions)
- display(user_acc.head(),user_acc.shape)
- del all_actions,user_acc
- gc.collect()
- print ('get_recent_user_feat finsihed')
3.7.1.2 构造训练集
特征工程-构建函数创建新特征
- start_date = '2016-02-01'
- end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
- end_date = end_date.strftime('%Y-%m-%d') # 转为字符串
- all_actions = get_all_action()
- user_cate = get_user_cate_feature(start_date, end_date, all_actions)
- display(user_cate.head())
- user_cate = user_cate.reset_index(drop = True)# 处理索引
- display(user_cate.head())
- del all_actions,user_cate
- gc.collect()
def make_actions(user, product, all_actions, start_date):
    """Assemble one labelled sample set for a 3-day window at start_date.

    Feature window is [start_date, start_date + 3 days); accumulated
    product/category features use a 30-day window ending at the same date.
    Labels are purchases in the 5 days after the feature window.
    Negatives are down-sampled to 10x the number of positives.
    """
    end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
    end_date = end_date.strftime('%Y-%m-%d')
    # corrected time span for get_accumulate_product_feat / get_accumulate_cate_feat
    start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=30)
    start_days = start_days.strftime('%Y-%m-%d')
    print (end_date)
    user_acc = get_recent_user_feat(end_date, all_actions)
    print ('get_recent_user_feat finsihed')

    user_cate = get_user_cate_feature(start_date, end_date, all_actions)
    user_cate = user_cate.reset_index(drop = True)  # drop the user_id index; it is kept as a column
    print ('get_user_cate_feature finished')

    product_acc = get_accumulate_product_feat(start_days, end_date, all_actions)
    print ('get_accumulate_product_feat finsihed')
    cate_acc = get_accumulate_cate_feat(start_days, end_date, all_actions)
    print ('get_accumulate_cate_feat finsihed')
    comment_acc = get_comments_product_feat(end_date)
    print ('get_comments_product_feat finished')
    # labels: purchases in the 5 days following the feature window
    test_start_date = end_date
    test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)
    test_end_date = test_end_date.strftime('%Y-%m-%d')
    labels = get_labels(test_start_date, test_end_date, all_actions)
    print ("get labels")

    actions = None
    # multi-scale recent-behaviour features: last 3/5/7/10/15/21/30 days
    for i in (3, 5, 7, 10, 15, 21, 30):
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)
        start_days = start_days.strftime('%Y-%m-%d')
        if actions is None:
            actions = get_action_feat(start_days, end_date, all_actions, i)
        else:
            # note the merge keys here
            actions = pd.merge(actions, get_action_feat(start_days, end_date, all_actions, i),
                               how='left',
                               on=['user_id', 'sku_id', 'cate'])

    actions = pd.merge(actions, user, how='left', on='user_id')
    actions = pd.merge(actions, user_acc, how='left', on='user_id')
    actions = pd.merge(actions, user_cate, how='left', on='user_id')
    # note the merge keys here
    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
    actions = pd.merge(actions, product_acc, how='left', on='sku_id')
    actions = pd.merge(actions, cate_acc, how='left', on='cate')
    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
    actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])
    # fill NaNs introduced by the left merges (product/comment features, labels)
    actions = actions.fillna(0)
    # sampling: keep every positive, down-sample the negatives
    action_postive = actions[actions['label'] == 1]   # purchased
    action_negative = actions[actions['label'] == 0]  # did not purchase
    del actions
    neg_len = len(action_postive) * 10  # negatives = 10x positives
    action_negative = action_negative.sample(n=neg_len)
    action_sample = pd.concat([action_postive, action_negative], ignore_index=True)
    return action_sample
构造训练数据集
def make_train_set(start_date, setNums ,f_path, all_actions):
    """Build setNums sliding-window sample sets (the window start moves
    forward one day per round), concatenate them, and write to f_path."""
    train_actions = None
    user = get_basic_user_feat()
    print ('get_basic_user_feat finsihed')
    product = get_basic_product_feat()
    print ('get_basic_product_feat finsihed')
    # sliding window: build several training/validation sets
    for i in range(setNums):
        print(start_date)
        if train_actions is None:
            train_actions = make_actions(user, product, all_actions, start_date)
        else:
            train_actions = pd.concat([train_actions,
                                       make_actions(user, product, all_actions, start_date)],
                                      ignore_index=True)
        # advance the window start by one day for the next round
        start_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=1)
        start_date = start_date.strftime('%Y-%m-%d')
        print ("round {0}/{1} over!".format(i+1, setNums))
    train_actions.to_csv(f_path, index=False)
    del train_actions
-
# Training & validation sets: 20 sliding windows starting 2016-03-01
start_date = '2016-03-01'
all_actions = get_all_action()
make_train_set(start_date, 20, 'train_set.csv',all_actions)
del all_actions
gc.collect()
# Test set: 3 sliding windows starting 2016-04-01
val_start_date = '2016-04-01'
all_actions = get_all_action()
make_train_set(val_start_date, 3, 'test_set.csv',all_actions)
del all_actions
gc.collect()
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from matplotlib import pylab as plt
import gc
data = pd.read_csv('train_set.csv')
display(data.head(), data.shape)
data_X = data.loc[:, data.columns != 'label']  # features: every column except the label
data_y = data.loc[:, data.columns == 'label']  # target
X_train,X_val,y_train, y_val = train_test_split(data_X, data_y, test_size= 0.2, random_state= 0)  # hold out 20% for validation
users = X_val[['user_id', 'sku_id', 'cate']].copy()  # keep the ids for later evaluation joins
# drop user id and sku id: natural-number identifiers, little predictive value
del X_train['user_id']
del X_train['sku_id']
# NOTE(review): ids are dropped from X_train only, not from X_val — confirm
# the two column sets match before building the validation DMatrix below.

display(X_train.head(), X_train.shape)
display(X_val.head(), X_val.shape)
del data, data_X, data_y
gc.collect()
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)
'''
'min_child_weight': 5 — minimum sum of instance weight in a child node.
If a leaf's weight sum drops below min_child_weight the split stops, so
raising it guards against overfitting.
gamma = 0.1 — minimum loss reduction required for a further partition on
a leaf node; larger is more conservative (typically 0.1-0.2).
scale_pos_weight = 10 — balances positive/negative weights for the
imbalanced classes and helps convergence.
'eta': 0.1 — shrinkage; acts like a learning rate.'''
param = {'n_estimators': 4000, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0.1,
         'subsample': 0.9,'colsample_bytree': 0.8, 'scale_pos_weight':10, 'eta': 0.1,
         'objective': 'binary:logistic','eval_metric':['auc','error']}

# xgb.train takes the round count as an argument ('n_estimators' in param is ignored)
num_round = param['n_estimators']
evallist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)
bst.save_model('bst.model')
def feature_importance(bst_xgb):
    """Dump normalised feature importances of a trained Booster to CSV.

    Writes 'feature_importance_.csv' with columns [feature, fscore],
    sorted descending, with fscore normalised to sum to 1.
    """
    importance = bst_xgb.get_fscore()
    importance = sorted(importance.items(), key= lambda x :x[1], reverse= True)
    df = pd.DataFrame(importance, columns= ['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    file_name = 'feature_importance_.csv'
    df.to_csv(file_name, index = False)

feature_importance(bst)  # fixed: was misspelled 'feature_importancce' (NameError)

feature_importance_ = pd.read_csv('feature_importance_.csv')
feature_importance_.head()
查看验证数据
算法预测
- X_val_DMatrix = xgb.DMatrix(X_val)
- y_pred = bst.predict(X_val_DMatrix)
- X_val['pred_label'] = y_pred
- X_val.head()
目标值概率转分类
def label(column):
    """Threshold the predicted purchase probability at 0.5 into a 0/1 label.

    Applied row-wise (DataFrame.apply with axis=1); mutates and returns
    the row.  Fixed: original read 'del label(column):', a SyntaxError —
    the keyword must be 'def'.
    """
    if column['pred_label'] > 0.5:
        column['pred_label'] = 1
    else:
        column['pred_label'] = 0
    return column
- X_val = X_val.apply(label,axis = 1)
- X_val.head()
添加真实值用户ID商品编号
- X_val['true_label'] = y_val
- X_val['user_id'] = users['user_id']
- X_val['sku_id'] = users['sku_id']
- X_val.head()
购买用户统计
- # 所有购买用户
- all_user_set = X_val[X_val['true_label'] == 1]['user_id'].unique()
- print(len(all_user_set))
-
- # 所有预测购买的用户
- all_user_test_set = X_val[X_val['pred_label'] ==1]['user_id'].unique()
- print(len(all_user_test_set))
准确率 召回率
- pos, neg = 0,0
- for user_id in all_user_test_set:
- if user_id in all_user_set:
- pos +=1
- else:
- neg +=1
- all_user_acc = 1.0 * pos / (pos + neg)
- all_user_recall = 1.0 * pos / len(all_user_set)
- print('所有用户中预测购买用户的准确率' + str(all_user_acc))
- print('所有用户中预测购买用户的召回率' + str(all_user_recall))
实际商品对 准确率 召回率(更加精细:用户-商品ID)
- # 所有预测购买用户商品对应关系
- all_user_test_item_pair = X_val[X_val['pred_label'] == 1]['user_id'].map(str) + '-' + X_val[X_val['pred_label'] == 1]['sku_id'].map(str)
- all_user_test_item_pair = np.array(all_user_test_item_pair)
- print(len(all_user_test_item_pair))
- # 所有实际商品对
- all_user_item_pair = X_val[X_val['true_label'] ==1]['user_id'].map(str) + '-' + X_val[X_val['true_label'] ==1]['sku_id'].map(str)
- all_user_item_pair = np.array(all_user_item_pair)
-
- pos, neg = 0, 0
- for user_item_pair in all_user_test_item_pair:
- if user_item_pair in all_user_item_pair:
- pos += 1
- else:
- neg += 1
- all_item_acc = pos / (pos + neg)
- all_item_recall = pos / len(all_user_item_pair)
- print('所有用户中预测购买用户的准确率' + str(all_item_acc))
- print('所有用户中预测购买用户的召回率' + str(all_item_recall))
数据加载
- X_data = pd.read_csv('test_set.csv')
- display(X_data.head())
- X_test, y_test = X_data.iloc[:,:-1], X_data.iloc[:,-1]
算法预测
- users = X_test[['user_id', 'sku_id', 'cate']].copy()
- del X_test['user_id']
- del X_test['sku_id']
-
- X_test_DMatrix = xgb.DMatrix(X_test)
- y_pred = bst.predict(X_test_DMatrix)
- X_test['pred_label'] = y_pred
- X_test.head()
目标值概率转分类
def label(column):
    """Map the predicted probability to a hard class label: > 0.5 -> 1, else 0.

    Applied row-wise via DataFrame.apply(axis=1); mutates and returns the row.
    """
    column['pred_label'] = 1 if column['pred_label'] > 0.5 else 0
    return column
- X_test = X_test.apply(label, axis = 1)
- X_test.head()
添加真实用户ID 商品信息
- X_test['true_label'] = y_test
- X_test['user_id'] = users['user_id']
- X_test['sku_id'] = users['sku_id']
- X_test.head()
购买用户统计
- # 所有购买用户
- all_user_set = X_test[X_test['true_label'] == 1]['user_id'].unique()
- print(len(all_user_set))
-
- # 所有预测购买的用户
- all_user_test_set = X_test[X_test['pred_label'] == 1]['user_id'].unique()
- print(len(all_user_test_set))
准确率 召回率
- pos, neg = 0,0
- for user_id in all_user_test_set:
- if user_id in all_user_set:
- pos += 1
- else:
- neg += 1
- all_user_acc = pos /(pos + neg)
- all_user_recall = pos / len(all_user_set)
- print('所有用户中预测购买用户的准确率' + str(all_user_acc))
- print('所有用户中预测购买用户的召回率' + str(all_user_recall))
实际商品对 准确率 召回率
- # 所有预测购买用户商品对应关系
- all_user_test_item_pair = X_test[X_test['pred_label'] == 1]['user_id'].map(str) + '-' + X_test[X_test['pred_label'] == 1]['sku_id'].map(str)
- all_user_test_item_pair = np.array(all_user_test_item_pair)
- print(len(all_user_test_item_pair))
- # 所有实际商品对
- all_user_item_pair = X_test[X_test['true_label'] ==1]['user_id'].map(str) + '-' + X_test[X_test['true_label'] ==1]['sku_id'].map(str)
- all_user_item_pair = np.array(all_user_item_pair)
-
- pos, neg = 0, 0
- for user_item_pair in all_user_test_item_pair:
- if user_item_pair in all_user_item_pair:
- pos += 1
- else:
- neg += 1
- all_item_acc = pos / (pos + neg)
- all_item_recall = pos / len(all_user_item_pair)
- print('所有用户中预测购买用户的准确率' + str(all_item_acc))
- print('所有用户中预测购买用户的召回率' + str(all_item_recall))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。