赞
踩
# Assign the contents of the row labelled 287 to the row labelled 288.
# NOTE(review): `data` is defined outside this snippet — presumably a DataFrame; confirm.
data.loc[288] = data.loc[287]
# Select the last three columns of `grade` by name, then slice rows 0:100.
a=grade[[grade.columns[-3],grade.columns[-2],grade.columns[-1]]][0:100]
# Select every column of `grade`, then slice rows 0:100 (this rebinds `a`).
a=grade[grade.columns][0:100]
项目.xlsx 是输入文件。下面的脚本按 visibility 列把记录分成公开(public)和私有(private)两部分,并作为两个 sheet 写入 项目-公私分开.xlsx。
# encoding: utf-8
# Split the rows of 项目.xlsx by their `visibility` column and write the
# private and public rows to two separate sheets of 项目-公私分开.xlsx.
import pandas as pd

input_path = "项目.xlsx"  # renamed from `input` so the builtin is not shadowed
output = '项目-公私分开.xlsx'

data = pd.read_excel(input_path)
data['visibility'] = data['visibility'].astype('category')
data['Paths'] = data['Paths'].astype('str')

# The context manager saves and closes the workbook on exit, even if a write
# fails. ExcelWriter.save() no longer exists in pandas >= 2.0.
with pd.ExcelWriter(output) as writer:
    private_rows = data[data['visibility'] == 'private']
    private_rows.to_excel(writer, index=False, sheet_name='私-底')

    public_rows = data[data['visibility'] == 'public']
    public_rows.to_excel(writer, index=False, sheet_name='公-底')
data[data['id']==424365]
等操作。
import numpy as np  # np.random.randn is used below; the snippet never imported numpy
import pandas as pd

# Sample frame: two string key columns plus two columns of random floats.
df = pd.DataFrame({'key1': list('ababa'),
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
print(df)
data1 data2 key1 key2
0 -1.313101 -0.453361 a one
1 0.791463 1.096693 b two
2 0.462611 1.150597 a one
3 -0.216121 1.381333 b two
4 0.077367 -0.282876 a one
# Group df['data1'] using df['key1'] as the grouping key.
grouped = df['data1'].groupby(df['key1'])
print(grouped.mean())
# Sample output:
# key1
# a   -0.257707
# b    0.287671
# Name: data1, dtype: float64

states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
# `states` becomes the first index level and `years` the second.
print(df['data1'].groupby([states, years]).mean())
# Sample output:
# California  2005    0.791463
#             2006    0.462611
# Ohio        2005   -0.764611
#             2006    0.077367
# Name: data1, dtype: float64

# Group df by 'key1' and average the remaining numeric columns.
# numeric_only=True is needed on pandas >= 2.0: the string column 'key2' is no
# longer dropped silently there, and mean() raises TypeError without it.
df.groupby('key1').mean(numeric_only=True)
# Sample output:
#          data1     data2
# key1
# a    -0.257707  0.138120
# b     0.287671  1.239013
# 'key2' is missing from the result because it is not numeric data. Only the
# numeric columns are aggregated; they can also be filtered down to a subset.
# `name` is the key1 value of the current group; `group` is the sub-frame
# holding that group's rows, which is what gets printed.
for name, group in df.groupby('key1'):
    # The loop body must be indented; the flattened original raised IndentationError.
    print(name, group)
a data1 data2 key1 key2
0 -1.313101 -0.453361 a one
2 0.462611 1.150597 a one
4 0.077367 -0.282876 a one
b data1 data2 key1 key2
1 0.791463 1.096693 b two
3 -0.216121 1.381333 b two
sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
axis:{0 or 'index', 1 or 'columns'}, default 0,默认按照列排序,为1,则是横向排序。
by:str or list of str;如果axis=0,那么by=“列名”;如果axis=1,那么by=“行名”。
ascending:True则升序,如果by=['列名1','列名2'],则该参数可以是[True, False],即第一字段升序,第二个降序。
inplace:布尔型,是否用排序后的数据框替换现有的数据框。
kind:排序方法,{'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'。似乎不用太关心。
na_position:{'first', 'last'}, default 'last',默认缺失值排在最后面。
# Sample frame with columns b, a, c and index labels 2, 0, 1, 3.
df = pd.DataFrame({'b':[1,2,3,2],'a':[4,3,2,1],'c':[1,3,8,2]},index=[2,0,1,3])
b a c
2 1 4 1
0 2 3 3
1 3 2 8
3 2 1 2
df.sort_values(by='b') # same as df.sort_values(by='b', axis=0)
b a c
2 1 4 1
0 2 3 3
3 2 1 2
1 3 2 8
df.sort_values(by=['b','a'],axis=0,ascending=[False,True]) # sort by 'b' descending, breaking ties by 'a' ascending
b a c
1 3 2 8
3 2 1 2
0 2 3 3
2 1 4 1
df.sort_values(by=3,axis=1) # axis=1 must be given: reorders the columns by the values in the row labelled 3
a b c
2 4 1 1
0 3 2 3
1 2 3 8
3 1 2 2
import pandas as pd
# Build a 2x2 frame, then show its underlying array and that array's type.
data = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
values = data.values
print(values)
print(type(values))
[[1 2]
[3 4]]
<class 'numpy.ndarray'>
data.values 返回一个 numpy 二维数组:每条记录对应一行,每个属性对应一列,但数组不再带有列名。
print(data.shape)
import pandas as pd
data = pd.DataFrame(columns=['a','b'], data=[[1,2],[3,4]])
data
>>> data
a b
0 1 2
1 3 4
data['c'] = ''
>>> data
a b c
0 1 2
1 3 4
data['d'] = [5,6]
data
>>> data
a b c d
0 1 2 5
1 3 4 6
import pandas as pd
# Build a small uid/score table, then move 'uid' from a column to the index.
rows = [['a1', 1], ['a2', 4]]
df = pd.DataFrame(rows, columns=['uid', 'score'])
print(df)
df.set_index(keys='uid', inplace=True)  # modifies df itself rather than returning a new frame
print(df)
set_index():
DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False)
keys:列标签或列标签/数组列表,需要设置为索引的列
drop:删除用作新索引的列
append:是否将列附加到现有索引
inplace:默认False,为True时就地修改DataFrame(不创建新对象)
verify_integrity:默认False,检查新索引是否存在重复项。否则,将检查推迟到必要时进行。设置为False可以提高该方法的性能。
import pandas as pd
# Build a two-column frame, then print the whole frame and each column.
df = pd.DataFrame([['a1', 1], ['a2', 4]], columns=['uid', 'score'])
print(df)
print (df['uid'])
# Arbitrary marker line that visually separates the two column dumps.
print ("sdfdsfdfgddddddddddddddddddddddddddddddddddddd")
print (df['score'])
import pandas as pd

# The statements were collapsed onto one line (a SyntaxError); one per line here.
df = pd.DataFrame([['a1', 1], ['a2', 4]], columns=['uid', 'score'])
print(df)
print(df['score'])  # integer values before the conversion
# Convert the 'score' column so that every value is a string.
df['score'] = df['score'].astype('str')
print(df['score'])
import pandas as pd
# Build the uid/score table.
df = pd.DataFrame([['a1', 1], ['a2', 4]], columns=['uid', 'score'])
# Store the categorical form of the 'uid' column in 'score'.
# NOTE(review): this overwrites the numeric scores with the uid values;
# presumably df['score'].astype('category') was intended — confirm.
df['score']=df['uid'].astype('category');
print (df['score'])
drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
labels是指要删除的标签,一个或者是列表形式的多个,
axis指定哪一个轴
columns是指某一列或者多列,
level是指等级,针对多重索引
inplace是否替换原来的dataframe
import pandas as pd
import numpy as np
# A 5x3 frame of random values with rows a-e; drop column 'one' in place.
row_labels = list('abcde')
column_labels = ['one', 'two', 'three']
df = pd.DataFrame(np.random.randn(5, 3), index=row_labels, columns=column_labels)
print(df)
df.drop(columns=['one'], inplace=True)
print(df)
import pandas as pd
import numpy as np
# A 5x3 frame of random values with rows a-e; drop rows 'a' and 'b' in place.
row_labels = list('abcde')
column_labels = ['one', 'two', 'three']
df = pd.DataFrame(np.random.randn(5, 3), index=row_labels, columns=column_labels)
print(df)
df.drop(index=['a', 'b'], inplace=True)
print(df)
>>> df = pd.DataFrame(np.arange(12).reshape(3,4),
...                   columns=['A', 'B', 'C', 'D'])
>>> df
   A  B   C   D
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
#指定删除相关的列,没有带columns,所以要指出是哪个轴上的
>>> df.drop(['B', 'C'], axis=1)
   A   D
0  0   3
1  4   7
2  8  11
#这里带有columns,所以不用加上axis参数
>>> df.drop(columns=['B', 'C'])
   A   D
0  0   3
1  4   7
2  8  11
#删除指定索引的行,这里没有axis参数,就是默认axis=0,也就是删除行
>>> df.drop([0, 1])
   A  B   C   D
2  8  9  10  11
print (ad.columns)
Index(['Unnamed: 0', 'creative_id', 'ad_id', 'product_id', 'product_category',
'advertiser_id', 'industry'],
dtype='object')
print (type(ad.columns))
<class 'pandas.core.indexes.base.Index'>
ad.isnull().any()
ad.isnull()
import numpy as np
import pandas as pd

# Outlier capping: every value whose magnitude exceeds 3 is replaced by +/-3.
data = pd.DataFrame(np.random.randn(10000, 4))
print(data.head())
print(data.describe())
print(".....................................\n")
# Rows holding at least one value beyond +/-3. `axis` is passed by keyword
# because pandas >= 2.0 rejects the positional form .any(1).
print(data[(np.abs(data) > 3).any(axis=1)])
print(".....................................\n")
# np.sign gives -1/0/+1 per cell, so this frame holds -3/0/+3.
print(np.sign(data) * 3)
print(".....................................\n")
# Overwrite only the outlier cells with 3 carrying the sign of the original value.
data[np.abs(data) > 3] = np.sign(data) * 3
# No row exceeds the limit any more, so this prints an empty frame.
print(data[(np.abs(data) > 3).any(axis=1)])
import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.randn(10000, 4))
print(data[0][0])
# Plain assignment binds a second name to the same DataFrame; nothing is copied.
data1 = data
# Write with a single .loc call (row label 0, column label 0). The chained form
# data1[0][0] = 12 assigns into an intermediate Series, which pandas does not
# guarantee will update the frame.
data1.loc[0, 0] = 12
print(data[0][0])  # the change is visible through `data` too
# Sample output:
# -0.2163364765902928
# 12.0
import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.randn(10000, 4))
print(data[0][0])
# .copy() creates an independent DataFrame, so edits to data1 leave data untouched.
data1 = data.copy()
# Write with a single .loc call (row label 0, column label 0). The chained form
# data1[0][0] = 12 assigns into an intermediate Series, which pandas does not
# guarantee will update the frame.
data1.loc[0, 0] = 12
print(data[0][0])  # unchanged: only the copy was modified
# Sample output:
# -1.323364816178571
# -1.323364816178571
import numpy as np
import pandas as pd

# The code and its printed tables were collapsed onto one line (a SyntaxError);
# the statements are restored here and the captured output kept as comments.
data = pd.DataFrame(np.random.randn(8, 4))
print(data)
# Remove the rows labelled 0, 1 and 2 in place; remaining rows keep their labels.
data.drop([0, 1, 2], axis=0, inplace=True)
print(data)
# Sample output (values are random and differ per run):
#           0         1         2         3
# 0 -1.356098 -2.757676  1.298255  0.680684
# 1 -0.850784 -0.499739  0.735980 -1.367757
# 2 -0.136712 -1.517955  1.732635 -0.616452
# 3 -0.333158  0.247137 -0.778323 -0.619491
# 4  0.447829  0.319978  1.584691 -0.327392
# 5  0.842971  1.618222 -0.309938 -1.128479
# 6  1.317082 -1.590495  0.968119  0.699859
# 7 -0.815379 -0.034858 -2.285859 -1.610159
#           0         1         2         3
# 3 -0.333158  0.247137 -0.778323 -0.619491
# 4  0.447829  0.319978  1.584691 -0.327392
# 5  0.842971  1.618222 -0.309938 -1.128479
# 6  1.317082 -1.590495  0.968119  0.699859
# 7 -0.815379 -0.034858 -2.285859 -1.610159
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。