赞
踩
本文主要使用jupyter lab 进行数据分析
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
-
- plt.rcParams['font.sans-serif'].insert(0, 'SimHei') # 设置中文字体
- plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题
'运行
- orders_df = pd.read_excel('data/某电商平台2021年订单数据.xlsx', index_col='id')
- orders_df.info()
修改列的字段名
df.rename(columns={' ' : ' ', ' ' : ' ' })
就地修改
inplace=True # 函数将会就地修改对象,而不返回新的对象。这意味着对原始对象的更改会直接生效,而不需要为新对象分配额外的内存空间。
'运行
删除对应的行和列
- # 删除列
- df.drop('A', axis=1, inplace=True) # 在原始 DataFrame 中删除列 'A'
-
- # 删除行
- df.drop(0, axis=0, inplace=True) # 在原始 DataFrame 中删除第一行
根据条件在 Series 中对值进行过滤
- # 使用 where 函数进行过滤
- filtered_s = s.where(s > 3, 0) # 如果值大于3,保留原值,否则用0替换
-
- # 使用 mask 函数进行过滤
- masked_s = s.mask(s > 3, -1) # 如果值大于3,用-1替换,否则保留原值
- # 修改渠道字段的列名为channelID,平台字段的列名为platformType
- orders_df.rename(columns={'chanelID': 'channelID', 'platfromType': 'platformType'}, inplace=True)
-
-
- # 筛选出2021年的订单(orderTime在2021年)
- index = orders_df[orders_df.orderTime.dt.year != 2021].index
- orders_df.drop(index=index, inplace=True)
-
-
- # 删除支付金额(payment)小于0的订单,删除支付时间(payTime)早于下单时间(orderTime)的订单
- index = orders_df[(orders_df.payment < 0) | (orders_df.payTime < orders_df.orderTime)].index
- orders_df.drop(index=index, inplace=True)
-
-
- # 删除支付时长超过30分钟的订单
- index = orders_df[(orders_df.payTime - orders_df.orderTime).dt.total_seconds() > 1800].index
- orders_df.drop(index=index, inplace=True)
-
-
- # 对于支付金额(payment)大于订单金额(orderAmount)的订单,将支付金额修改为订单金额乘以平均折扣率
- a, b = orders_df[orders_df.payment <= orders_df.orderAmount][['orderAmount', 'payment']].sum()
- mean_discount = b / a
- # Series.where - 满足条件保留原来的值,不满足条件用第二个参数替换掉
- # Series.mask - 满足条件的被替换掉,不满足条件的保留原来的值
- orders_df['payment'] = orders_df.payment.where(
- orders_df.payment <= orders_df.orderAmount,
- (orders_df.orderAmount * mean_discount).round(2)
- )
-
-
- # 将渠道(channelID)字段的空值填充为众数
- most_common_channel = orders_df.channelID.mode()[0]
- orders_df.channelID.fillna(most_common_channel, inplace=True)
- orders_df.info()
-
-
- # 处理掉平台(platformType)字段的异常值(最后只有四种值:微信、支付宝、App、Web)
- orders_df['platformType'] = orders_df.platformType.str.replace(
- r'[\s·]', '', regex=True
- ).str.title(
- ).str.replace(
- r'薇信|Vx', '微信', regex=True
- ).str.replace(
- r'网页|网站', 'Web', regex=True
- )
- orders_df.platformType.unique()
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
- # 统计出核心指标,包括:GMV、净销售额、客单价、拒退率、复购率
- gmv = orders_df.orderAmount.sum()
- sales = orders_df.query('not chargeback').payment.sum()
- arppu = sales / orders_df.userID.nunique()
- brate = orders_df.query('chargeback').orderID.count() / orders_df.orderID.count()
- temp = pd.pivot_table(
- orders_df.query('not chargeback'),
- index='userID',
- values='orderID',
- aggfunc='nunique'
- ).rename(columns={'orderID': 'orderCount'})
- ser = temp.orderCount.map(lambda x: 1 if x > 1 else 0)
- rrate = ser.sum() / ser.count()
- # 统计月度GMV和净销售额
- orders_df['month'] = orders_df.orderTime.dt.month
- temp1 = pd.pivot_table(
- orders_df,
- index='month',
- values='orderAmount',
- aggfunc='sum'
- ).round(2).rename(columns={'orderAmount': 'GMV'})
- temp2 = pd.pivot_table(
- orders_df.query('not chargeback'),
- index='month',
- values='payment',
- aggfunc='sum'
- ).round(2).rename(columns={'payment': '净销售额'})
- # pd.merge(temp1, temp2, how='inner', left_index=True, right_index=True)
- temp = pd.concat((temp1, temp2), axis=1)
- temp
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
- temp.plot(kind='line', figsize=(10, 4), xlabel='', linewidth=0.5, linestyle='--', marker='^')
- plt.ylim(0, 14000000)
- plt.xticks(temp.index)
- plt.legend(loc='lower right')
- plt.show()
指定 PyEcharts 在 Jupyter Lab 环境中正确显示图表
- from pyecharts.globals import CurrentConfig, NotebookType
-
- CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
准备对应的数据
- months = [f'{x}月' for x in temp.index.values.tolist()]
- gmv = (temp.GMV / 1000000).round(2).tolist()
- sales = (temp.净销售额 / 1000000).round(2).tolist()
- import pyecharts.options as opts
- from pyecharts.charts import Line
-
- line_chart = Line(init_opts=opts.InitOpts(width='1000px', height='500px'))
- line_chart.add_xaxis(months)
- line_chart.add_yaxis("GMV", gmv)
- line_chart.add_yaxis(
- "净销售额",
- sales,
- markline_opts=opts.MarkLineOpts(
- data=[
- opts.MarkLineItem(type_="average"),
- ]
- )
- )
- line_chart.load_javascript()
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
line_chart.render_notebook()
- ser = orders_df.query('not chargeback').groupby('channelID')['payment'].sum()
- ser
- import pyecharts.options as opts
- from pyecharts.charts import Pie
-
- x_data = ser.index.values.tolist()
- y_data = ser.values.tolist()
- data = [z for z in zip(x_data, y_data)]
-
- pie_chart = Pie()
- pie_chart.add(
- series_name="",
- data_pair=data,
- radius=["50%", "70%"],
- )
- pie_chart.set_global_opts(
- legend_opts=opts.LegendOpts(is_show=False)
- )
- pie_chart.set_series_opts(
- tooltip_opts=opts.TooltipOpts(
- trigger="item",
- formatter="{b}: {c}"
- ),
- label_opts=opts.LabelOpts(formatter="{b}: {d}%")
- )
- pie_chart.load_javascript()
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
pie_chart.render_notebook()
- # 统计星期几用户下单量最高
- orders_df['weekday'] = (orders_df.orderTime.dt.weekday + 1) % 7
- temp = orders_df.pivot_table(
- index='weekday',
- values='orderID',
- aggfunc='nunique'
- )
- temp.plot(kind='bar', figsize=(8, 4), xlabel='', legend=False)
- for i in np.arange(7):
- plt.text(i, temp.orderID[i] + 100, temp.orderID[i], ha='center', fontdict=dict(size=9))
- plt.xticks(np.arange(7), labels=[f'星期{x}' for x in '日一二三四五六'], rotation=0)
- plt.show()
- # 统计每天哪个时段用户下单量最高
- orders_df['time'] = orders_df.orderTime.dt.floor('30T').dt.strftime('%H:%M')
- temp = pd.pivot_table(
- orders_df,
- index='time',
- values='orderID',
- aggfunc='nunique'
- )
- plt.figure(figsize=(10, 4), dpi=200)
- plt.bar(temp.index, temp.orderID, color=np.random.random((8, 3)), width=0.8)
- plt.xticks(rotation=75)
- plt.show()
- # 以自然月为窗口计算每个月的复购率
- temp = pd.pivot_table(
- orders_df.query('not chargeback'),
- index='userID',
- columns='month',
- values='orderID',
- aggfunc='nunique'
- )
- temp = temp.map(lambda x: x if np.isnan(x) else (1 if x > 1 else 0))
- temp.sum() / temp.count()
- from datetime import datetime
-
- # 用RFM模型实现用户价值分群
- rfm_model_df = pd.pivot_table(
- orders_df.query('not chargeback'),
- index='userID',
- values=['orderTime', 'orderID', 'payment'],
- aggfunc={
- 'orderTime': 'max',
- 'orderID': 'nunique',
- 'payment': 'sum'
- }
- ).rename(columns={'orderTime': 'R', 'orderID': 'F', 'payment': 'M'})
- rfm_model_df['R'] = (datetime(2021, 12, 31, 23, 59, 59) - rfm_model_df.R).dt.days
- # reindex - 调整行索引或列索引的顺序(也可以用花式索引来调整)
- rfm_model_df = rfm_model_df.reindex(columns=['R', 'F', 'M'])
- rfm_model_df
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
- # 将RFM模型的原始值处理成对应的等级
- def handle_r(value):
- if value <= 14:
- return 5
- elif value <= 30:
- return 4
- elif value <= 90:
- return 3
- elif value <= 180:
- return 2
- return 1
-
-
- def handle_f(value):
- if value >= 5:
- return 5
- return value
-
-
- def handle_m(value):
- if value < 500:
- return 1
- elif value < 800:
- return 2
- elif value < 1200:
- return 3
- elif value < 2500:
- return 4
- return 5
'运行
- rfm_model_df['R'] = rfm_model_df.R.apply(handle_r)
- rfm_model_df['F'] = rfm_model_df.F.apply(handle_f)
- rfm_model_df['M'] = rfm_model_df.M.apply(handle_m)
- rfm_model_df.mean()
- rfm_model_df = (rfm_model_df > rfm_model_df.mean()).map(lambda x: '高' if x else '低')
- rfm_model_df
- rfm_model_df['Tag'] = rfm_model_df.sum(axis=1)
- rfm_model_df
- rfm_model_df['Tag'] = rfm_model_df.Tag.astype('category').cat.reorder_categories(
- ['高高高', '高低高', '低高高', '低低高', '高高低', '高低低', '低高低', '低低低']
- )
- result = rfm_model_df.Tag.value_counts().sort_index()
- result
- result.plot(
- kind='pie',
- autopct='%.2f%%',
- pctdistance=0.8,
- ylabel=''
- )
- plt.show()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。