赞
踩
import numpy as np
import pandas as pd
place_order_time = data['place_order_time'].astype('str')
data['place_order_time'] = pd.to_datetime(place_order_time)
print(data['place_order_time'].head())
输出:
0 2016-08-01 11:05:36
1 2016-08-01 11:07:07
2 2016-08-01 11:07:40
3 2016-08-01 11:11:11
4 2016-08-01 11:11:30
Name: place_order_time, dtype: datetime64[ns]
ser.dt常用时间属性:
year/month/day/hour/minute/second/date
data['year'] = data['place_order_time'].dt.year
data['place_order_time'].dt.week.head() # 一年的第几周
0 31
1 31
2 31
3 31
4 31
Name: place_order_time, dtype: int64
time1 = data['place_order_time'].head()
# 两个时间表的索引不同时无法进行运算
time2 = data['place_order_time'].tail().reset_index()['place_order_time'] # 重置索引,但会生成新的一列来保留原来的索引,
print(time2-time1)
0 9 days 10:50:48
1 9 days 10:49:41
2 9 days 10:54:12
3 9 days 10:52:47
4 9 days 10:53:00
Name: place_order_time, dtype: timedelta64[ns]
# 往后平移一天
data['place_order_time']+pd.Timedelta(days=1)
# 往后平移一小时
(data['place_order_time']+pd.Timedelta(hours=1)).head()
0 2016-08-01 12:05:36
1 2016-08-01 12:07:07
2 2016-08-01 12:07:40
3 2016-08-01 12:11:11
4 2016-08-01 12:11:30
Name: place_order_time, dtype: datetime64[ns]
groupby:分组函数 - 需要指定分组键
使用:表格名.groupby(分组键)[聚合键].聚合函数()
data.groupby('order_id')[['amounts','counts']].sum().sort_values(by='counts',ascending=False).head()
说明:
分组后生成表:
amounts | counts | |
---|---|---|
order_id | ||
557 | 957 | 30 |
1186 | 655 | 28 |
1146 | 944 | 27 |
1026 | 1021 | 24 |
392 | 704 | 24 |
参数说明:
pd.pivot_table(data,index=['order_id'],aggfunc=[np.mean,np.sum],values=['amounts','counts']).head()
输出:
mean | sum | |||
---|---|---|---|---|
amounts | counts | amounts | counts | |
order_id | ||||
137 | 32.333333 | 1.500000 | 194 | 9 |
165 | 52.944444 | 1.166667 | 953 | 21 |
166 | 48.200000 | 1.400000 | 241 | 7 |
171 | 36.285714 | 1.428571 | 254 | 10 |
177 | 34.250000 | 1.000000 | 137 | 4 |
pd.pivot_table(data,columns=['order_id'],index=['dishes_name'],aggfunc=[np.sum],values=['counts'],fill_value=0,margins=True).head()
添加maigin
sum | |||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
counts | |||||||||||||||||||||
order_id | 137 | 165 | 166 | 171 | ... | All | |||||||||||||||
dishes_name | |||||||||||||||||||||
42度海之蓝 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 5.0 | |||||||||||||||
北冰洋汽水 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 45.0 | |||||||||||||||
38度剑南春 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 6.0 | |||||||||||||||
50度古井贡酒 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 5.0 | |||||||||||||||
52度泸州老窖 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 8.0 |
5 rows × 279 columns
一般不用
pd.crosstab(index=data['order_id'],columns=data['dishes_name'],values=data['counts'],aggfunc=[np.sum])
pandas/numpy中的统计分析方法(sum/mean/std/var…)都可以直接使用
# 对多列同时使用多个分析方法
data[['amounts','counts']].agg([np.sum,np.mean,np.std])
# 对不同列分别使用不同的分析方法
# data[['amounts','counts']].agg({'amounts':np.mean,'counts':np.sum})
data[['amounts','counts']].agg({'amounts':np.mean,'counts':[np.sum,np.std]})
data.groupby('order_id')[['amounts','counts']].agg([np.mean,np.sum]).head()
在pandas或者numpy中没有现成的函数可以使用,可以通过transform使用自定义的函数
# 写法一:
data['counts'].transform(lambda x:x*2)
# 写法二:
def transform1(value):
values = value*2
return values
data['counts'].transform(transform1)
data2 = pd.read_csv(r'D:/qq文档/2393971222/FileRecv/order-14.3.csv',sep=',',encoding='gbk',engine='python')
print(data.info())
data['dishes_name'].str.strip()
a = data.loc[data['dishes_name'].str.contains('鳝')]
a.drop_duplicates(subset='dishes_name')
data['dishes_name'].drop_duplicates() # 删掉重复值
subset可以传入多个值,传入多个值时,同时满足时才算相同值
data.drop_duplicates(subset=['dishes_name','dishes_id'])
# 纵向合并
df = pd.concat((df1,df2,...),axis=0) # 列索引必须对应一致
# 横向合并
df = pd.concat((df1,df2,...),axis=1) # 行索引必须对应一致
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
输出:
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
重要参数:join=‘outer’:外连接,求并集。join=‘inner’:内连接,求交集。
pd.concat((left,right),axis=0,join='inner')
key1 | key2 | |
---|---|---|
0 | K0 | K0 |
1 | K0 | K1 |
2 | K1 | K0 |
3 | K2 | K1 |
0 | K0 | K0 |
1 | K1 | K0 |
2 | K1 | K0 |
3 | K2 | K0 |
解决行索引没有意义的情况下,数据行不匹配问题(解决concat中横向拼接问题)
参数说明:
pd.merge(left,right,on='key1',how='left')
单主键输出:
A | B | key1 | key2_x | C | D | key2_y | |
---|---|---|---|---|---|---|---|
0 | A0 | B0 | K0 | K0 | C0 | D0 | K0 |
1 | A1 | B1 | K0 | K1 | C0 | D0 | K0 |
2 | A2 | B2 | K1 | K0 | C1 | D1 | K0 |
3 | A2 | B2 | K1 | K0 | C2 | D2 | K0 |
4 | A3 | B3 | K2 | K1 | C3 | D3 | K0 |
pd.merge(left,right,on=['key1','key2'],how='right')
多主键输出:
A | B | key1 | key2 | C | D | |
---|---|---|---|---|---|---|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
1 | A2 | B2 | K1 | K0 | C1 | D1 |
2 | A2 | B2 | K1 | K0 | C2 | D2 |
3 | NaN | NaN | K2 | K0 | C3 | D3 |
pd.merge(left,right,left_on='key1',right_on='key2',how='left')
两表分别指定主键:
A | B | key1_x | key2_x | C | D | key1_y | key2_y | |
---|---|---|---|---|---|---|---|---|
0 | A0 | B0 | K0 | K0 | C0 | D0 | K0 | K0 |
1 | A0 | B0 | K0 | K0 | C1 | D1 | K1 | K0 |
2 | A0 | B0 | K0 | K0 | C2 | D2 | K1 | K0 |
3 | A0 | B0 | K0 | K0 | C3 | D3 | K2 | K0 |
4 | A1 | B1 | K0 | K1 | C0 | D0 | K0 | K0 |
5 | A1 | B1 | K0 | K1 | C1 | D1 | K1 | K0 |
6 | A1 | B1 | K0 | K1 | C2 | D2 | K1 | K0 |
7 | A1 | B1 | K0 | K1 | C3 | D3 | K2 | K0 |
8 | A2 | B2 | K1 | K0 | NaN | NaN | NaN | NaN |
9 | A3 | B3 | K2 | K1 | NaN | NaN | NaN | NaN |
left.rename(columns={'key1':'key222'},inplace=False)
dict1 = {'ID':[1,2,3,4,5,6,7,8,9],
'System':['W10','w10',np.nan,'w10',np.nan,np.nan,'w7','w7','w8']}
dict2 = {'ID':[1,2,3,4,5,6,7,8,9],
'System':[np.nan,np.nan,'w7','w7','w7','w7','w8',np.nan,np.nan]}
df1 = pd.DataFrame(dict1)
df2 = pd.DataFrame(dict2)
df1.combine_first(df2)
以df1作为主表,利用df2中的表格数据填充df1,当df2中数据与df1不一致的时候,以df1为主。
输出:
ID | System | |
---|---|---|
0 | 1 | W10 |
1 | 2 | w10 |
2 | 3 | w7 |
3 | 4 | w10 |
4 | 5 | w7 |
5 | 6 | w7 |
6 | 7 | w7 |
7 | 8 | w7 |
8 | 9 | w8 |
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。