Pandas provides a number of functions for reading tabular data into DataFrame objects.
File type | Method
csv | pd.read_csv('./data/table.csv', dtype={}, encoding='utf-8', nrows=)
csv | pd.read_table('./data/table.csv', sep=",")
xls/xlsx | pd.read_excel('./data/table.xlsx')
txt | pd.read_table('./data/table.txt')
pd.read_csv(file, sep=',', delimiter=None, header='infer', names=None, skiprows=None, nrows=None, encoding=None, index_col=None, usecols=None)
pd.read_excel(io, sheet_name=0, header=0, names=None, index_col=None, usecols=None, squeeze=False, dtype=None)
DataFrame.to_csv(path, sep=',', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None)
DataFrame.to_excel(excel_writer, sheet_name='Sheet1', na_rep='', columns=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep='inf', verbose=True, freeze_panes=None)
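A minimal read/write round trip using these signatures; the file paths and the 'id' column below are placeholders, not from the original notes:

import pandas as pd

df = pd.read_csv('./data/table.csv', dtype={'id': str}, encoding='utf-8', nrows=100)  # force 'id' to string, read only 100 rows
df.to_csv('./data/table_out.csv', index=False, encoding='utf-8')                      # write back without the row index
df.to_excel('./data/table_out.xlsx', sheet_name='Sheet1', index=False)                # needs an Excel engine such as openpyxl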
Data cleaning and preparation
Main functions and what they do
None | np.nan
None is Python's built-in null value; its type is NoneType, a plain Python object. None cannot take part in any computation. | np.nan is a floating-point value, so it can take part in computations, but the result is always NaN.

>>> type(None)
<class 'NoneType'>
>>> type(np.nan)
<class 'float'>
>>> np.nan + 10
nan
df = pd.DataFrame(np.random.randint(1, 100, size=(4, 6)), index=['A', 'B', 'C', 'D'],
                  columns=['a', 'b', 'c', 'd', 'e', 'f'])
df.loc['A', 'c'] = np.nan
df.loc['C', 'c'] = None
df.loc['B', 'd'] = None
>>> df.isnull()
>>> df.notnull()
.notnull().any() aggregates with a logical "or": the result is True as soon as one value is True.
.notnull().all(axis=0) aggregates with a logical "and": the result is False as soon as one value is False.
Dropping rows with missing data directly
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
Missing data handling example
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],"toy": [np.nan, 'Batmobile', 'Bullwhip’],"born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]})
Filling in missing values
df.fillna(value=None, method=None, axis=None, inplace=False, limit=None)
Missing data handling example
df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], [np.nan, np.nan, np.nan, 5],[np.nan, 3, np.nan, 4]], columns=list('ABCD'))
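A sketch of fillna on this frame (output shown approximately):

>>> df.fillna(0)                 # replace every NaN with a scalar
     A    B    C  D
0  0.0  2.0  0.0  0
1  3.0  4.0  0.0  1
2  0.0  0.0  0.0  5
3  0.0  3.0  0.0  4
>>> df.fillna(method='ffill')    # propagate the last valid value downward
     A    B   C  D
0  NaN  2.0 NaN  0
1  3.0  4.0 NaN  1
2  3.0  4.0 NaN  5
3  3.0  3.0 NaN  4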
DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, …)
Data transformation: removing duplicates example
df=pd.DataFrame({'A':[1,1,1,2,2,3],'B':list("aabbbc")})
>>> df
   A  B
0  1  a
1  1  a
2  1  b
3  2  b
4  2  b
5  3  c
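A sketch of drop_duplicates on this frame:

>>> df.drop_duplicates()                           # drop rows that fully duplicate an earlier row
   A  B
0  1  a
2  1  b
3  2  b
5  3  c
>>> df.drop_duplicates(subset=['A'], keep='last')  # judge duplicates by column A only, keep the last occurrence
   A  B
2  1  b
4  2  b
5  3  c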
df.replace(to_replace=None, value=None, inplace=False)
df = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [5, 6, 7, 8, 9],'C': ['a', 'b', 'c', 'd', 'e']})
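A few replace calls sketched on this frame (only the calls are shown):

df.replace(0, 5)                   # every 0 becomes 5
df.replace([1, 2], 10)             # several values mapped to one replacement
df.replace({'a': 'x', 'b': 'y'})   # dict form: each key is replaced by its value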
pd.cut(x, bins, right=True, labels=None, retbins=False,precision=3, include_lowest=False)
Data transformation: discretization example 1
Example 2 (see the sketch below)
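The slides' concrete discretization examples are not reproduced here; a minimal pd.cut sketch with made-up ages and bin edges:

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)            # each age is assigned to an interval such as (18, 25]
pd.value_counts(cats)                # how many values fall into each bin
pd.cut(ages, bins, labels=['youth', 'young', 'middle', 'senior'])   # name the bins instead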
Filtering or transforming outliers is largely a matter of applying array operations.
data = pd.DataFrame(np.random.randn(1000, 4))
data[(np.abs(data) > 3).any(axis=1)]          # rows containing at least one value whose absolute value exceeds 3
data[np.abs(data) > 3] = np.sign(data) * 3    # cap every value to the interval [-3, 3]
data.head()
DataFrame.sample(n=None, frac=None, replace=False, weights=None, axis=None)
df = pd.DataFrame({'key': ['b', 'a', 'b', 'c', 'a', 'b'], 'data1': range(6)})
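A sketch of sample on this frame; since sampling is random, no output is shown:

df.sample(n=3)                       # three rows drawn without replacement
df.sample(frac=0.5, replace=True)    # half the rows, drawn with replacement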
Pandas vectorized string functions: example
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
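String methods accessed through .str skip missing values instead of raising; for example:

>>> data.str.contains('gmail')       # NaN stays NaN
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
>>> data.str.split('@').str[1]       # the domain part of each address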
Hierarchical indexing: set_index turns ordinary columns into a multi-level index
>>> df = pd.DataFrame({'first': ['a', 'a', 'b', 'b', 'c'], 'two': ['1', '2', '3', '1', '3'],
...                    'values': [1, 2, 3, 4, 5], 'values1': [11, 22, 33, 44, 55]})
>>> df
  first two  values  values1
0     a   1       1       11
1     a   2       2       22
2     b   3       3       33
3     b   1       4       44
4     c   3       5       55
>>> df.set_index(['first', 'two'])
           values  values1
first two
a     1         1       11
      2         2       22
b     3         3       33
      1         4       44
c     3         5       55
Hierarchical indexing
Building the index from two nested lists
data = pd.Series(np.random.randint(2, 5, (10,)),
                 index=[['大数据', '大数据', '大数据', '物联网', '物联网', '物联网', '网络工程', '网络工程', '计算机', '计算机'],
                        [1, 2, 3, 1, 2, 3, 1, 2, 1, 2]])
Hierarchical indexing: selecting from a multi-level Series
data=pd.Series(range(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
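Selecting from this two-level Series (the values follow from range(10)):

>>> data['b']          # outer level
1    3
2    4
3    5
dtype: int64
>>> data['b', 3]       # a single element
5
>>> data[:, 2]         # inner level across all outer keys
a    1
b    4
c    7
d    8
dtype: int64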
Hierarchical indexing: selecting from a multi-level DataFrame
df = pd.DataFrame(np.random.randint(50, 100, size=(4, 4)),
                  columns=pd.MultiIndex.from_product([["math", "physics"], ["term1", "term2"]]),
                  index=pd.MultiIndex.from_tuples([("class1", "LiLei"), ("class1", "HanMeiMei"),
                                                   ("class2", "LiLei"), ("class2", "RuHua")]))
Hierarchical indexing: reshaping with stack and unstack
Hierarchical indexing: reshaping example
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['Oh', 'Co'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'], name='number'))
>>> data
number  one  two  three
state
Oh        0    1      2
Co        3    4      5
>>> data.stack()
state  number
Oh     one       0
       two       1
       three     2
Co     one       3
       two       4
       three     5
dtype: int32
>>> data.stack().unstack()
number  one  two  three
state
Oh        0    1      2
Co        3    4      5
Hierarchical indexing on both axes
>>> frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
...                      index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
...                      columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
>>> frame
     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11
Naming the levels of a hierarchical index
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
>>> frame.index.names = ['key1', 'key2']
>>> frame.columns.names = ['state', 'color']
>>> frame
state      Ohio     Colorado
color     Green Red    Green
key1 key2
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
>>> frame['Ohio']
color      Green  Red
key1 key2
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10
Hierarchical indexing: summary statistics by level
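Assuming the frame with named index and column levels from the example above, statistics can be aggregated on a chosen level (older pandas also allowed frame.sum(level=...)); output shown approximately:

>>> frame.groupby(level='key2').sum()
state  Ohio     Colorado
color Green Red    Green
key2
1         6   8       10
2        12  14       16
>>> frame.groupby(level='color', axis=1).sum()
color      Green  Red
key1 key2
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10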
pd.merge() joins columns on keys
Merging datasets: pd.merge() example 1
>>> left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'A': ['A0', 'A1', 'A2', 'A3'],
...                      'B': ['B0', 'B1', 'B2', 'B3']})
>>> right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'C': ['C0', 'C1', 'C2', 'C3'],
...                       'D': ['D0', 'D1', 'D2', 'D3']})
>>> result = pd.merge(left, right, on='key')
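The merged result aligns the two frames on key:

>>> result
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3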
Merging datasets: pd.merge() example 2 (see the sketch below)
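The original example 2 is not reproduced here; a common variation, sketched with made-up frames, is an outer join on partially overlapping keys:

left2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['A0', 'A1', 'A2']})
right2 = pd.DataFrame({'key': ['K1', 'K2', 'K3'], 'B': ['B1', 'B2', 'B3']})
pd.merge(left2, right2, on='key', how='outer')   # keep keys from both sides; unmatched cells become NaN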
Merging datasets: the pd.concat() method
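pd.concat stacks objects along an axis rather than joining on keys. A minimal sketch with made-up frames:

df1 = pd.DataFrame({'A': ['A0', 'A1'], 'B': ['B0', 'B1']})
df2 = pd.DataFrame({'A': ['A2', 'A3'], 'B': ['B2', 'B3']})
pd.concat([df1, df2])                      # stack vertically, keeping the original row labels
pd.concat([df1, df2], ignore_index=True)   # renumber the rows 0..3
pd.concat([df1, df2], axis=1)              # glue side by side, aligning on the index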
Merging datasets: the DataFrame.append() method
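DataFrame.append behaved like a row-wise concat; it has been deprecated in recent pandas releases in favor of pd.concat. Reusing df1 and df2 from the sketch above:

df1.append(df2, ignore_index=True)         # older pandas
pd.concat([df1, df2], ignore_index=True)   # the equivalent, preferred today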
How groupby() works
DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, dropna=True, group_keys=True, observed=False)
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5), 'data2': np.random.randn(5)})
>>> g = df.groupby("key1")
>>> g
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000023241D04B48>

Attributes and methods of the GroupBy object:
>>> g.groups                # the index labels belonging to each group
{'a': Int64Index([0, 1, 4], dtype='int64'),
 'b': Int64Index([2, 3], dtype='int64')}
>>> g.size()
key1
a    3
b    2
dtype: int64

>>> g.get_group('a')        # fetch the members of one group
  key1 key2     data1     data2
0    a  one  0.678844  0.722869
1    a  two  1.236807 -0.997396
4    a  one  1.373321 -1.392988
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5), 'data2': np.random.randn(5)})
>>> people = pd.DataFrame(np.arange(25).reshape((5, 5)), columns=['a', 'b', 'c', 'd', 'e'],
...                       index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
>>> people.iloc[2:3, [1, 2]] = np.nan   # add a few NA values
>>> mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

>>> people
         a     b     c   d   e
Joe      0   1.0   2.0   3   4
Steve    5   6.0   7.0   8   9
Wes     10   NaN   NaN  13  14
Jim     15  16.0  17.0  18  19
Travis  20  21.0  22.0  23  24

>>> people.groupby(mapping, axis=1).sum()
        blue   red
Joe      5.0   5.0
Steve   15.0  20.0
Wes     13.0  24.0
Jim     35.0  50.0
Travis  45.0  65.0
>>> columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
...                                     names=['cty', 'tenor'])
>>> hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
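With a named column level, the group keys can come from the level itself; for example, counting columns per country (the count is deterministic because randn produces no missing values):

>>> hier_df.groupby(level='cty', axis=1).count()
cty  JP  US
0     2   3
1     2   3
2     2   3
3     2   3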
If you pass a list of functions or function names, the columns of the resulting DataFrame are named after those functions.
Example
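A sketch using the earlier df grouped by key1; data1 is random, so only the resulting column names matter here:

grouped = df.groupby('key1')
grouped['data1'].agg(['mean', 'std'])                            # result columns are named 'mean' and 'std'
grouped['data1'].agg([('average', 'mean'), ('stdev', 'std')])    # (name, function) pairs give custom names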