赞
踩
最近做 AETA地震预测AI算法大赛 这个比赛,看他的baseline代码时,发现了这么几行:
# Keep only the stations present in all three sets: stations that have a
# _magn file, a _sound file, AND both MagnUpdate & SoundUpdate flags.
usable_stations = _continueable_stations & _set_magn & _set_sound
dump_object(Usable_Station_Path, usable_stations)

print('合并数据:')
for data_type in ('magn', 'sound'):  # don't shadow the builtin `type`
    frames = []
    # NOTE: tqdm already appends ": " after desc, so no trailing colon here.
    for _id in tqdm(usable_stations, desc=data_type):
        # e.g. Data_Folder_Path + '19' + '_magn.csv'  ->  './data/19_magn.csv'
        # Used_features[data_type] selects the feature columns to keep.
        csv_path = f'{Data_Folder_Path}{_id}_{data_type}.csv'
        frames.append(pd.read_csv(csv_path)[Used_features[data_type]])
    final_df = pd.concat(frames)
    # Pickle re-loads far faster than CSV in the later pipeline steps.
    final_df.to_pickle(Merged_Data_Path[data_type])
    del final_df  # free the merged frame before the next iteration
为啥读取后合并完,要.to_pickle
导出为pkl文件呢?
合并数据:
magn:: 100%|██████████| 131/131 [11:44<00:00, 5.38s/it]
sound:: 100%|██████████| 131/131 [08:51<00:00, 4.05s/it]
另外注意 tqdm(usable_stations, desc=f'{type}:') 这一用法:tqdm 的进度条会在 desc
后面自动加上一个冒号“: ”,所以不用在 desc
参数中再额外加 “:”,否则会像上面输出那样出现 “magn::” 双冒号。
提速百倍的Pandas性能优化方法,让你的Pandas飞起来! 中,指明,读取csv、hdf和pkl文件中,读取 pkl 格式最快
他的代码还有这样两行:
# 将该区域的地震 AETA 台站数据拿出来
local_magn_data = magn_data[magn_data['StationID'].apply(lambda x:x in ID_list)].reset_index(drop=True)
哇,直接卡的一批,慢死了!!
意思是:
idx = magn_data['StationID'].apply(lambda x:x in ID_list) # 找出 StationID 那列,元素在ID_list的行,是的话给True, 反之给False
local_magn_data = magn_data[idx] # 将那些行取出来
local_magn_data = local_magn_data.reset_index(drop=True) # 重置index
根据提速百倍的Pandas性能优化方法,让你的Pandas飞起来! 我们尝试用 .isin
方法来替换.apply(lambda x:x in ID_list)
local_magn_data = magn_data[magn_data['StationID'].isin(ID_list)].reset_index(drop=True)
替换完之后单步调试1s就通过,Yes!
# Pre-allocate the rolling-window feature columns as float (seeding with
# None would make them object dtype); NaN marks "not yet computed".
for _suffix in ('mean', 'max', 'min', 'max_min'):
    res_df[f'{feature}_{_suffix}'] = np.nan

for i, row in res_df.iterrows():
    endDay = row['Day']
    startDay = endDay - window
    # Values of `feature` inside the half-open window (startDay, endDay].
    data_se = df[(df['Day'] > startDay) & (df['Day'] <= endDay)][feature]
    # Use .loc — chained indexing (res_df[col].iloc[i] = ...) assigns into a
    # temporary and may silently not write back (SettingWithCopyWarning).
    res_df.loc[i, f'{feature}_mean'] = data_se.mean()
    res_df.loc[i, f'{feature}_max'] = data_se.max()
    res_df.loc[i, f'{feature}_min'] = data_se.min()
    res_df.loc[i, f'{feature}_max_min'] = data_se.max() - data_se.min()
这样操作是先给 res_df[f'{feature}_mean']
占个坑,但后边处理时,这个列是 object 的 type
所以建议变成
res_df[f'{feature}_mean'] = 0.0
res_df[f'{feature}_max'] = 0.0
res_df[f'{feature}_min'] = 0.0
res_df[f'{feature}_max_min'] = 0.0
以上是读取速度加快的方式,让我想起之前kaggle上看到的 [Reducing DataFrame memory size by ~65%] 降低 Pandas 读取内存的方式:
直接用他写的这个函数就行:
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)


def reduce_mem_usage(props):
    """Downcast every numeric column of ``props`` to the smallest dtype that
    holds its values, typically shrinking DataFrame memory use by ~65%.

    Adapted from:
    https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65/notebook

    Parameters
    ----------
    props : pandas.DataFrame
        Mutated in place: numeric columns are re-assigned with downcast
        dtypes, and NaNs in integral columns are filled with ``min - 1``.

    Returns
    -------
    tuple(pandas.DataFrame, list)
        The same DataFrame, and ``NAlist`` — names of columns whose missing
        values were filled (integer dtypes cannot represent NaN).
    """
    start_mem_usg = props.memory_usage().sum() / 1024 ** 2
    print("Memory usage of properties dataframe is :", start_mem_usg, " MB")
    NAlist = []  # columns whose missing values were filled in
    for col in props.columns:
        # Only plain numeric columns can be downcast; this also keeps
        # np.isfinite below from raising on object/datetime/bool columns.
        if not np.issubdtype(props[col].dtype, np.number):
            continue
        print("******************************")
        print("Column: ", col)
        print("dtype before: ", props[col].dtype)
        # Integer dtypes do not support NaN: fill with (min - 1) as a
        # distinguishable sentinel and record the column for the caller.
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            props[col] = props[col].fillna(props[col].min() - 1)
        # Compute bounds AFTER filling so the sentinel is inside the range
        # (the original notebook used pre-fill bounds — an overflow bug).
        mx = props[col].max()
        mn = props[col].min()
        # The column is really integral if truncating to int64 changes
        # (almost) nothing in total.
        diff = (props[col] - props[col].fillna(0).astype(np.int64)).sum()
        if -0.01 < diff < 0.01:
            if mn >= 0:
                # Unsigned: narrowest uint that can hold the maximum.
                for np_type in (np.uint8, np.uint16, np.uint32, np.uint64):
                    if mx <= np.iinfo(np_type).max:
                        props[col] = props[col].astype(np_type)
                        break
            else:
                # Signed: narrowest int whose range covers [mn, mx].
                for np_type in (np.int8, np.int16, np.int32, np.int64):
                    info = np.iinfo(np_type)
                    if info.min <= mn and mx <= info.max:
                        props[col] = props[col].astype(np_type)
                        break
        else:
            # Genuinely fractional values: float32 halves float64's footprint.
            props[col] = props[col].astype(np.float32)
        print("dtype after: ", props[col].dtype)
        print("******************************")
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024 ** 2
    print("Memory usage is: ", mem_usg, " MB")
    print("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, NAlist
使用方式:
# Load the raw dataset, then downcast its columns to shrink memory usage.
props = pd.read_csv(r"../input/properties_2016.csv") #The properties dataset
# props is the memory-reduced DataFrame; NAlist names the columns whose
# missing values were filled in (with column.min() - 1).
props, NAlist = reduce_mem_usage(props)
print("_________________")
print("")
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("_________________")
print("")
print(NAlist)
还可以参考:
How to Speed up Pandas by 4x with one line of code
How to Speed Up Pandas Calculations
关于 np.isfinite
使用的方式:
>>> np.isfinite(1) True >>> np.isfinite(0) True >>> np.isfinite(np.nan) False >>> np.isfinite(np.inf) False >>> np.isfinite(np.NINF) False >>> np.isfinite([np.log(-1.),1.,np.log(0)]) array([False, True, False]) >>> x = np.array([-np.inf, 0., np.inf]) >>> y = np.array([2, 2, 2]) >>> np.isfinite(x, y) array([0, 1, 0]) >>> y array([0, 1, 0])
返回 True 如果 x 不是正无穷大、负无穷大或 NaN;否则返回 False。如果 x 是标量,则这是一个标量
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。