赞
踩
NumPy 是一个 Python 包,它代表 “Numeric Python”。 它是一个由多维数组对象(矩阵)和用于处理数组的例程集合组成的库
使用 NumPy,开发人员可以执行以下操作:数组的算术和逻辑运算;傅里叶变换和用于形状操作的例程;与线性代数相关的操作(NumPy 内置了线性代数和随机数生成函数)。
NumPy 通常与 SciPy(Scientific Python)和 Matplotlib(绘图库)一起使用,这种组合广泛用于替代 MatLab
引入numpy库
import numpy as np
# List the conda environments that exist on this machine.
conda env list
# Activate the environment named my_python.
activate my_python
# Install the packages into the active environment; -i selects the package index (mirror) URL.
python -m pip install numpy scipy matplotlib ipython jupyter pandas sympy nose -i https://pypi.douban.com/simple/
NumPy 中定义的最重要的对象是称为 ndarray 的 N 维数组类型。它描述相同类型的元素集合,可以使用基于零的索引访问集合中的项目。ndarray 类的实例可以通过本教程后面描述的不同的数组创建例程来构造。
numpy.array(object, dtype = None, copy = True, order = None, subok = False, ndmin = 0)
# object 任何暴露数组接口的对象,或(嵌套的)序列
# dtype 数组的所需数据类型,可选
# copy 可选,默认为true,对象是否被复制
# order C(按行)、F(按列)或A(任意,默认)
# subok 默认情况下,返回的数组被强制为基类数组。 如果为true,则返回子类
# ndmin 指定返回数组的最小维数
几种创建 array 的方式
# Ways to create an ndarray.

# 1. From (nested) Python lists.
a = np.array([1, 2, 3])
print(a)
b = np.array([[1, 2], [3, 4]])
print(b)

# 2. Filled arrays: np.zeros / np.ones / np.empty take
#    (shape, dtype, order) where order is 'C' (row-major) or 'F' (column-major).
#    np.ones((3, 3), dtype=int) and np.empty((3, 3), dtype=int) are called the same way.
d = np.zeros((3, 3), dtype=int)
print(d)
# [[0 0 0]
#  [0 0 0]
#  [0 0 0]]

# 3. np.arange(start, stop, step) -- stop is excluded.
e = np.arange(10, 20, 2)
print(e)  # [10 12 14 16 18]
f = np.arange(12).reshape((3, 4))
print(f)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

# 4. np.linspace(start, stop, num): num evenly spaced values with both end
#    points included, so (1, 10, 6) yields 6 values (5 equal intervals).
g = np.linspace(1, 10, 6)
# The reshape can be chained as here, or done in a separate statement:
# g = g.reshape((2, 3))
g = np.linspace(1, 10, 6).reshape((2, 3))
print(g)
# [[ 1.   2.8  4.6]
#  [ 6.4  8.2 10. ]]
Ndarray对象的声明:
# Plain assignment does not copy an ndarray: both names refer to one object.
a = np.array([0, 1, 2, 3], dtype=float)
b = a
a[0] = 0.3
print(a)       # [0.3 1.  2.  3. ]
print(b is a)  # True
# b sees the change made through a
print(b)       # [0.3 1.  2.  3. ]

c = a
c[1:3] = [4, 4]
print(c)       # [0.3 4.  4.  3. ]
# a sees the change made through c
print(a)       # [0.3 4.  4.  3. ]

# To get an independent array, copy it explicitly.
a = np.array([0, 1, 2, 3], dtype=float)
b = a.copy()
a[0] = 0.3
print(a)       # [0.3 1.  2.  3. ]
print(b)       # [0. 1. 2. 3.]
序号 | 数据类型及描述 |
---|---|
1. | bool_ 存储为一个字节的布尔值(真或假) |
2. | int_ 默认整数,相当于 C 的long ,通常为int32 或int64 |
3. | intc 相当于 C 的int ,通常为int32 或int64 |
4. | intp 用于索引的整数,相当于 C 的ssize_t ,通常为int32 或int64 |
5. | int8 字节(-128 ~ 127) |
6. | int16 16 位整数(-32768 ~ 32767) |
7. | int32 32 位整数(-2147483648 ~ 2147483647) |
8. | int64 64 位整数(-9223372036854775808 ~ 9223372036854775807) |
9. | uint8 8 位无符号整数(0 ~ 255) |
10. | uint16 16 位无符号整数(0 ~ 65535) |
11. | uint32 32 位无符号整数(0 ~ 4294967295) |
12. | uint64 64 位无符号整数(0 ~ 18446744073709551615) |
13. | float_ float64 的简写 |
14. | float16 半精度浮点:符号位,5 位指数,10 位尾数 |
15. | float32 单精度浮点:符号位,8 位指数,23 位尾数 |
16. | float64 双精度浮点:符号位,11 位指数,52 位尾数 |
17. | complex_ complex128 的简写 |
18. | complex64 复数,由两个 32 位浮点表示(实部和虚部) |
19. | complex128 复数,由两个 64 位浮点表示(实部和虚部) |
# 设置array的数据类型
c = np.array([1,23,4],dtype=np.int) # int32,int64,float16
print(c.dtype)# int32
ndarray.shape:返回一个包含数组维度的元组,它也可以用于调整数组大小
ndarray.ndim:返回数组的维数
b = np.array([[1, 2],
              [3, 4]])
print(b)
# Inspect the array's attributes.
print('number of dim:', b.ndim)  # label typo ("bumber") corrected
print('shape:', b.shape)
print('size:', b.size)
ndarray.itemsize:返回数组中每个元素的字节单位长度
# itemsize is the number of bytes used by one element.
a = np.array([1, 2, 3], dtype=float)
# dtype=int is platform dependent (4 or 8 bytes); np.int32 is used so the
# documented value below holds everywhere.
b = np.array([[1, 2], [3, 4]], dtype=np.int32)
print(a.itemsize)  # 8
print(b.itemsize)  # 4
基本运算:
# Arithmetic on arrays is applied element by element.
a = np.array([40, 50, 60, 70])
b = np.linspace(1, 40, 4)
print(a, b)  # [40 50 60 70] [ 1. 14. 27. 40.]

# Addition and subtraction
c = a - b
print(c)  # [39. 36. 33. 30.]
c = a + b
print(c)  # [ 41.  64.  87. 110.]

# Power
c = a ** 2
print(c)  # [1600 2500 3600 4900]

# Trigonometric functions (np.cos is used the same way)
c = 10 * np.sin(a)
print(c)  # [ 7.4511316  -2.62374854 -3.04810621  7.73890682]

# Comparisons produce boolean arrays
print(b > 10)   # [False  True  True  True]
print(b == 14)  # [False  True False False]
矩阵乘法
# Matrix multiplication
a = np.array([[1, 2],
              [3, 4]])
b = np.arange(4).reshape((2, 2))

# Form 1: `*` multiplies the two matrices element by element.
c = a * b
print(c)
# [[ 0  2]
#  [ 6 12]]

# Form 2: np.dot is the true matrix product (a.dot(b) is equivalent).
c_dot = np.dot(a, b)
print(c_dot)
# [[ 4  7]
#  [ 8 15]]
求出一些统计信息
a = np.random.random((2, 4))
print(a)
# e.g. [[0.88890236 0.18332218 0.40325598 0.76729931]
#       [0.22830906 0.33309742 0.14649372 0.16507163]]
print(np.sum(a))  # sum of all elements
print(np.min(a))  # minimum
print(np.max(a))  # maximum

# axis=0 reduces down each column; axis=1 reduces along each row.
print(np.sum(a, axis=1))  # e.g. [2.24277983 0.87297183]
print(np.min(a, axis=0))  # e.g. [0.22830906 0.18332218 0.14649372 0.16507163]
print(np.max(a, axis=1))  # e.g. [0.88890236 0.33309742]

# Indices of particular values
a = np.arange(2, 14).reshape((3, 4))
print(a)
# [[ 2  3  4  5]
#  [ 6  7  8  9]
#  [10 11 12 13]]
print(np.argmin(a))          # 0 -- index of the minimum in the flattened array
print(np.argmin(a, axis=1))  # [0 0 0]
print(np.argmax(a))          # 11
print(np.mean(a))            # mean: 7.5
print(np.average(a))         # mean: 7.5
print(np.median(a))          # median: 7.5
print(np.cumsum(a))          # running totals: [ 2  5  9 14 20 27 35 44 54 65 77 90]
print(np.diff(a))            # difference between neighbours in each row
# [[1 1 1]
#  [1 1 1]
#  [1 1 1]]
print(np.nonzero(a))
# (array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64),
#  array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], dtype=int64))
# The row numbers and column numbers of every non-zero element are
# returned as two separate arrays.
矩阵排序
# Build a 3x4 matrix counting down from 12 to 1.
a = np.arange(12, 0, -1).reshape(3, 4)
print(a)
# [[12 11 10  9]
#  [ 8  7  6  5]
#  [ 4  3  2  1]]

# np.sort sorts along the last axis, so each row is sorted on its own.
print(np.sort(a, axis=-1))
# [[ 9 10 11 12]
#  [ 5  6  7  8]
#  [ 1  2  3  4]]
矩阵转置
# Matrix transpose
a = np.arange(12, 0, -1).reshape((3, 4))
print(a)
# [[12 11 10  9]
#  [ 8  7  6  5]
#  [ 4  3  2  1]]
print(np.transpose(a))
# [[12  8  4]
#  [11  7  3]
#  [10  6  2]
#  [ 9  5  1]]
print((a.T).dot(a))
# [[224 200 176 152]
#  [200 179 158 137]
#  [176 158 140 122]
#  [152 137 122 107]]

# .T does nothing to a one-dimensional array. The 1-D array has to be
# defined here: the expected outputs below are for [1 1 1], not for the
# 3x4 matrix above.
a = np.array([1, 1, 1])
print(a.T)  # [1 1 1]

# Add a dimension with np.newaxis.
row = a[np.newaxis, :]
print(row)  # [[1 1 1]]
col = a[:, np.newaxis]
print(col)
# [[1]
#  [1]
#  [1]]

# [:, np.newaxis] can be applied when the array is defined, turning the
# 1-D array into a column.
a = np.array([1, 1, 1])[:, np.newaxis]
b = np.array([2, 2, 2])[:, np.newaxis]
print(np.hstack((a, b)))
# [[1 2]
#  [1 2]
#  [1 2]]
矩阵元素提取
# Clip the elements to the range [5, 9]. The matrix is defined here so the
# snippet runs on its own and matches the expected output.
a = np.arange(12, 0, -1).reshape((3, 4))
clipped = np.clip(a, 5, 9)
print(clipped)
# [[9 9 9 9]
#  [8 7 6 5]
#  [5 5 5 5]]

# Access by index
a = np.arange(3, 15)
print(a)     # [ 3  4  5  6  7  8  9 10 11 12 13 14]
print(a[3])  # 6

a = a.reshape((3, 4))
print(a)
# [[ 3  4  5  6]
#  [ 7  8  9 10]
#  [11 12 13 14]]
print(a[2])       # [11 12 13 14]
print(a[2][1])    # 12
print(a[2, 1])    # 12
print(a[2, :])    # ':' selects everything along that axis -- [11 12 13 14]
print(a[2, 1:3])  # [12 13]
矩阵遍历
# The matrix being traversed (defined here so the snippet runs on its own).
a = np.arange(3, 15).reshape((3, 4))

# Iterate over the rows.
for row in a:
    print(row)
# [3 4 5 6]
# [ 7  8  9 10]
# [11 12 13 14]

# Iterate over the columns by iterating over the rows of the transpose.
for column in a.T:
    print(column)
# [ 3  7 11]
# [ 4  8 12]
# [ 5  9 13]
# [ 6 10 14]

# Iterate over every element: flatten() returns a 1-D copy, while .flat
# is an iterator over the elements.
print(a.flatten())  # [ 3  4  5  6  7  8  9 10 11 12 13 14]
for item in a.flat:
    print(item)  # 3 4 5 6 7 8 9 10 11 12 13 14 (one per line)
矩阵合并
# 1. Combine arrays with vstack / hstack.
a = np.array([1, 1, 1])
b = np.array([2, 2, 2])

# Stack vertically (one on top of the other).
c = np.vstack((a, b))
print(c)
# [[1 1 1]
#  [2 2 2]]
print(c.shape)  # (2, 3)

# Stack horizontally (side by side).
d = np.hstack((a, b))
print(d)        # [1 1 1 2 2 2]
print(d.shape)  # (6,)

# 2. Combine arrays with np.concatenate along a chosen axis.
a = np.array([1, 1, 1])[:, np.newaxis]
b = np.array([2, 2, 2])[:, np.newaxis]
c = np.concatenate((a, b), axis=0)
print(c)
# [[1]
#  [1]
#  [1]
#  [2]
#  [2]
#  [2]]
c = np.concatenate((a, a, b, b), axis=1)
print(c)
# [[1 1 2 2]
#  [1 1 2 2]
#  [1 1 2 2]]
矩阵切割
a = np.arange(12).reshape((3, 4))
print(a)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

# 1. Split into 2 blocks of columns.
# Option 1:
print(np.hsplit(a, 2))
# Option 2:
print(np.split(a, 2, axis=1))
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]), array([[ 2,  3],
#                          [ 6,  7],
#                          [10, 11]])]

# 2. Split into 3 blocks of rows.
# Option 1:
print(np.vsplit(a, 3))
# Option 2:
print(np.split(a, 3, axis=0))
# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]

# 3. Unequal split. np.split(a, 3, axis=1) fails because 4 columns cannot
#    be divided into 3 equal parts; np.array_split allows uneven parts.
parts = np.array_split(a, 3, axis=1)
print(parts)
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]), array([[ 2],
#                          [ 6],
#                          [10]]), array([[ 3],
#                                         [ 7],
#                                         [11]])]
# A Series is a labelled 1-D array; without an explicit index the labels
# are 0..n-1. The NaN entry makes the whole Series float64.
values = [1, 3, 6, np.nan, 44, 1]
s = pd.Series(values)
print(s)
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64
Pandas DataFrame 是一个二维的表格型数据结构,类似二维数组,但同时带有行索引(index)和列索引(columns)
DataFrame 构造方法
pandas.DataFrame( data, index, columns, dtype, copy)
dates = pd.date_range('20210101', periods=6)
print(dates)
# DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
#                '2021-01-05', '2021-01-06'],
#               dtype='datetime64[ns]', freq='D')

# pd.DataFrame accepts the labels for both axes:
# index holds the row labels, columns holds the column labels.
# np.random.randn draws samples from the standard normal distribution.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
# (values are random, for example)
#                    a         b         c         d
# 2021-01-01 -1.051328  0.539978 -1.561199  0.406864
# 2021-01-02 -1.955839 -0.219654 -0.302030  1.405773
# 2021-01-03  0.442162  0.150657 -0.652139  0.817653
# 2021-01-04 -0.644356 -0.897049 -1.346952 -0.964228
# 2021-01-05 -0.385833  0.648738 -0.148433 -1.284740
# 2021-01-06  0.841797  0.217677 -0.694916 -0.114585

# Without index/columns both axes are labelled 0..n-1.
df = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
使用字典来规定数据格式
# Each dictionary key becomes a column; scalar values are repeated to the
# length of the longest column.
df = pd.DataFrame({
    'a': 1.,
    'b': pd.Timestamp('20210101'),
    'c': pd.Series(1, index=list(range(4)), dtype='float32'),
    'd': np.array([3] * 4, dtype='int32'),
    'e': pd.Categorical(["test", "train", "test", "train"]),
    'f': 'foo',
})
print(df)
#      a          b    c  d      e    f
# 0  1.0 2021-01-01  1.0  3   test  foo
# 1  1.0 2021-01-01  1.0  3  train  foo
# 2  1.0 2021-01-01  1.0  3   test  foo
# 3  1.0 2021-01-01  1.0  3  train  foo
print(df.dtypes)
# a           float64
# b    datetime64[ns]
# c           float32
# d             int32
# e          category
# f            object
# dtype: object
print(df.index)
# Int64Index([0, 1, 2, 3], dtype='int64')
# (pandas >= 2.0 prints this as Index([0, 1, 2, 3], dtype='int64'))
print(df.columns)
# Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
print(df.values)
# [[1.0 Timestamp('2021-01-01 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2021-01-01 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2021-01-01 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2021-01-01 00:00:00') 1.0 3 'train' 'foo']]
查看数据表的基本描述
# 运算数字型的数据的一些统计数据
print(df.describe())
# a c d
# count 4.0 4.0 4.0
# mean 1.0 1.0 3.0
# std 0.0 0.0 0.0 # 方差
# min 1.0 1.0 3.0
# 25% 1.0 1.0 3.0
# 50% 1.0 1.0 3.0
# 75% 1.0 1.0 3.0
# max 1.0 1.0 3.0
数据表的转置
# Swap rows and columns: the former column labels become the row index.
print(df.transpose())
#                      0  ...                    3
# a                    1  ...                    1
# b  2021-01-01 00:00:00  ...  2021-01-01 00:00:00
# c                    1  ...                    1
# d                    3  ...                    3
# e                 test  ...                train
# f                  foo  ...                  foo
#
# [6 rows x 4 columns]
在控制台显示全部数据,不省略
# Raise the display limits so printed frames are not truncated.
# Passing None instead of a number, e.g.
# pd.set_option('display.max_rows', None), removes the limit entirely.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
- 1
- 2
- 3
- 4
数据表的排序
# 1. Sort by row or column labels: sort_index()
#    axis=1 sorts by column name, ascending=False reverses the order.
print(df.sort_index(axis=1, ascending=False))
#      f      e  d    c          b    a
# 0  foo   test  3  1.0 2021-01-01  1.0
# 1  foo  train  3  1.0 2021-01-01  1.0
# 2  foo   test  3  1.0 2021-01-01  1.0
# 3  foo  train  3  1.0 2021-01-01  1.0

#    axis=0 sorts by row label.
print(df.sort_index(axis=0, ascending=False))
#      a          b    c  d      e    f
# 3  1.0 2021-01-01  1.0  3  train  foo
# 2  1.0 2021-01-01  1.0  3   test  foo
# 1  1.0 2021-01-01  1.0  3  train  foo
# 0  1.0 2021-01-01  1.0  3   test  foo

# 2. Sort by the values of a column: sort_values()
print(df.sort_values(by='e'))
#      a          b    c  d      e    f
# 0  1.0 2021-01-01  1.0  3   test  foo
# 2  1.0 2021-01-01  1.0  3   test  foo
# 1  1.0 2021-01-01  1.0  3  train  foo
# 3  1.0 2021-01-01  1.0  3  train  foo
获取数据表中的数据
dates = pd.date_range('20210101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
#              a   b   c   d
# 2021-01-01   0   1   2   3
# 2021-01-02   4   5   6   7
# 2021-01-03   8   9  10  11
# 2021-01-04  12  13  14  15
# 2021-01-05  16  17  18  19
# 2021-01-06  20  21  22  23

# Select one column, by key or by attribute; both give the same Series,
# so the output below is printed twice.
print(df['a'], df.a)
# 2021-01-01     0
# 2021-01-02     4
# 2021-01-03     8
# 2021-01-04    12
# 2021-01-05    16
# 2021-01-06    20
# Freq: D, Name: a, dtype: int32   (the integer width depends on the platform)

# Select rows by position, and by label (a label slice includes its end).
print(df[0:3], df.loc['20210101':'20210104'])
#             a  b   c   d
# 2021-01-01  0  1   2   3
# 2021-01-02  4  5   6   7
# 2021-01-03  8  9  10  11
#              a   b   c   d
# 2021-01-01   0   1   2   3
# 2021-01-02   4   5   6   7
# 2021-01-03   8   9  10  11
# 2021-01-04  12  13  14  15

# Select by label: .loc
print(df.loc['20210103'])
# a     8
# b     9
# c    10
# d    11
# Name: 2021-01-03 00:00:00, dtype: int32
print(df.loc[:, ['a', 'b']])
#              a   b
# 2021-01-01   0   1
# 2021-01-02   4   5
# 2021-01-03   8   9
# 2021-01-04  12  13
# 2021-01-05  16  17
# 2021-01-06  20  21
print(df.loc['20210103', ['a', 'b']])
# a    8
# b    9

# Select by position: .iloc
print(df.iloc[3])
# a    12
# b    13
# c    14
# d    15
print(df.iloc[3, 1])  # 13

# Mixed position/label selection. The .ix indexer was removed in
# pandas 1.0, so the first three row labels are looked up by position
# and then passed to .loc with the column names.
# ':3' means everything before position 3, '3:' everything from 3 on.
subset = df.loc[df.index[:3], ['a', 'c']]
print(subset)
#             a   c
# 2021-01-01  0   2
# 2021-01-02  4   6
# 2021-01-03  8  10

# Select by boolean mask: df[<boolean Series>]
print(df[df.a > 8])
#              a   b   c   d
# 2021-01-04  12  13  14  15
# 2021-01-05  16  17  18  19
# 2021-01-06  20  21  22  23
数据表的切片
# 切片选择
print(df.iloc[3:5,1:3])
# b c
# 2021-01-04 13 14
# 2021-01-05 17 18
# 切片选择不连续的列或行
print(df.iloc[[1,3,5],1:3])
# b c
# 2021-01-02 5 6
# 2021-01-04 13 14
# 2021-01-06 21 22
对数据表的指定位置赋值
# Assigning values
dates = pd.date_range('20210101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
#              a   b   c   d
# 2021-01-01   0   1   2   3
# 2021-01-02   4   5   6   7
# 2021-01-03   8   9  10  11
# 2021-01-04  12  13  14  15
# 2021-01-05  16  17  18  19
# 2021-01-06  20  21  22  23

# Assign by label.
df.loc['20210103', 'b'] = 100
print(df)
#              a    b   c   d
# 2021-01-01   0    1   2   3
# 2021-01-02   4    5   6   7
# 2021-01-03   8  100  10  11
# 2021-01-04  12   13  14  15
# 2021-01-05  16   17  18  19
# 2021-01-06  20   21  22  23

# Assign by position.
df.iloc[1, 1] = 100
print(df)
#              a    b   c   d
# 2021-01-01   0    1   2   3
# 2021-01-02   4  100   6   7
# 2021-01-03   8  100  10  11
# 2021-01-04  12   13  14  15
# 2021-01-05  16   17  18  19
# 2021-01-06  20   21  22  23

# Assign to a block of rows.
df.iloc[4:, :] = 0
print(df)
#              a    b   c   d
# 2021-01-01   0    1   2   3
# 2021-01-02   4  100   6   7
# 2021-01-03   8  100  10  11
# 2021-01-04  12   13  14  15
# 2021-01-05   0    0   0   0
# 2021-01-06   0    0   0   0

# Replace every value in the frame that meets a condition.
df[df > 10] = 6
print(df)
#             a  b   c  d
# 2021-01-01  0  1   2  3
# 2021-01-02  4  6   6  7
# 2021-01-03  8  6  10  6
# 2021-01-04  6  6   6  6
# 2021-01-05  0  0   0  0
# 2021-01-06  0  0   0  0

# Replace the values in one column that meet a condition. A single .loc
# call is used instead of the chained form df.a[df.a == 0] = 9, which may
# assign into a temporary copy and leave df unchanged.
df.loc[df.a == 0, 'a'] = 9
print(df)
#             a  b   c  d
# 2021-01-01  9  1   2  3
# 2021-01-02  4  6   6  7
# 2021-01-03  8  6  10  6
# 2021-01-04  6  6   6  6
# 2021-01-05  9  0   0  0
# 2021-01-06  9  0   0  0
添加新的列
# 添加新的列
df['e'] = np.nan
df['f'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20210101',periods=6))
print(df)
# a b c d e f
# 2021-01-01 9 1 2 3 NaN 1
# 2021-01-02 4 6 6 7 NaN 2
# 2021-01-03 8 6 10 6 NaN 3
# 2021-01-04 6 6 6 6 NaN 4
# 2021-01-05 9 0 0 0 NaN 5
# 2021-01-06 9 0 0 0 NaN 6
处理丢失数据(空值处理)
# Handling missing data
dates = pd.date_range('20210101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['a', 'b', 'c', 'd'])
# NaN is a float, so the target columns are converted first rather than
# relying on pandas to upcast an integer column during assignment.
df = df.astype({'c': float, 'd': float})
df.iloc[1, 2] = np.nan
df.iloc[3, 3] = np.nan
print(df)
#              a   b     c     d
# 2021-01-01   0   1   2.0   3.0
# 2021-01-02   4   5   NaN   7.0
# 2021-01-03   8   9  10.0  11.0
# 2021-01-04  12  13  14.0   NaN
# 2021-01-05  16  17  18.0  19.0
# 2021-01-06  20  21  22.0  23.0

# Drop rows (axis=0) that contain at least one NaN.
print(df.dropna(axis=0, how='any'))
#              a   b     c     d
# 2021-01-01   0   1   2.0   3.0
# 2021-01-03   8   9  10.0  11.0
# 2021-01-05  16  17  18.0  19.0
# 2021-01-06  20  21  22.0  23.0

# Drop rows only when every value in the row is NaN.
print(df.dropna(axis=0, how='all'))
#              a   b     c     d
# 2021-01-01   0   1   2.0   3.0
# 2021-01-02   4   5   NaN   7.0
# 2021-01-03   8   9  10.0  11.0
# 2021-01-04  12  13  14.0   NaN
# 2021-01-05  16  17  18.0  19.0
# 2021-01-06  20  21  22.0  23.0

# Fill the missing values.
print(df.fillna(value=0))
#              a   b     c     d
# 2021-01-01   0   1   2.0   3.0
# 2021-01-02   4   5   0.0   7.0
# 2021-01-03   8   9  10.0  11.0
# 2021-01-04  12  13  14.0   0.0
# 2021-01-05  16  17  18.0  19.0
# 2021-01-06  20  21  22.0  23.0

# Check where values are missing.
print(df.isnull())
#                 a      b      c      d
# 2021-01-01  False  False  False  False
# 2021-01-02  False  False   True  False
# 2021-01-03  False  False  False  False
# 2021-01-04  False  False  False   True
# 2021-01-05  False  False  False  False
# 2021-01-06  False  False  False  False

# Reduce the mask to one answer for the whole frame. Going through
# .values gives a plain NumPy array, so .any() returns a single boolean.
has_missing = bool(df.isnull().values.any())
print(has_missing)  # True
import pandas as pd
import numpy as np

# Raise the display limits so printed frames are not truncated.
# Passing None instead of a number removes the limit entirely.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Windows paths are written as raw strings so that backslashes are never
# read as escape sequences; the path values are the same as before.

# Importing data
# Read a CSV file.
data1 = pd.read_csv(r'D:\实验结果\任务调度实验\test1.csv')
print(data1)
#    num    name  age    id
# 0    0  asijdo    2  1121
# 1    1     asd  123  1213
# 2    2     asd   32  1231
# 3    3     afe   23  3231

# Read an Excel file.
data2 = pd.read_excel(r'D:\实验结果\任务调度实验\蚁群运行结果1.xlsx')
print(data2)

# Exporting data
data1.to_pickle(r'D:\实验结果\任务调度实验\student.pickle')
# NOTE: unpickling can execute arbitrary code; only read pickle files
# that come from a trusted source.
data3 = pd.read_pickle(r'D:\实验结果\任务调度实验\student.pickle')
print(data3)
#    num    name  age    id
# 0    0  asijdo    2  1121
# 1    1     asd  123  1213
# 2    2     asd   32  1231
# 3    3     afe   23  3231
pandas.concat
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
print(df1, df2, df3)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
#      a    b    c    d
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

# Stack the frames on top of each other; the original row labels are kept.
res = pd.concat([df1, df2, df3], axis=0)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

# ignore_index=True discards the original labels and renumbers the rows.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# Frames whose row and column labels differ
df1 = pd.DataFrame(np.ones((3, 4)) * 0, index=[1, 2, 3], columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, index=[2, 3, 4], columns=['b', 'c', 'd', 'e'])
print(df1, df2)
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

# join='inner' keeps only the columns present in both frames.
res = pd.concat([df1, df2], join='inner')
print(res)
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
res = pd.concat([df1, df2], join='inner', ignore_index=True)
print(res)
#      b    c    d
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
# 5  1.0  1.0  1.0

# Place the frames side by side; rows are aligned on their labels.
res = pd.concat([df1, df2], axis=1)
print(res)
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0

# Side by side, keeping only the rows of df1. The join_axes argument was
# removed in pandas 1.0; reindexing the result on df1.index is the
# replacement.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
print(res)
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
pandas.append(注意:DataFrame.append 已在 pandas 2.0 中移除,新代码请使用 pandas.concat)
# Appending rows. DataFrame.append was removed in pandas 2.0, so every
# append below is written with pd.concat, which gives the same result.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, index=[1, 2, 3], columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, index=[2, 3, 4], columns=['a', 'b', 'c', 'd'])
print(df1, df2)
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0
#      a    b    c    d
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

# Append one frame (formerly df1.append(df2, ignore_index=True)).
res = pd.concat([df1, df2], ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0

# Append several frames (formerly df1.append([df2, df3], ignore_index=True)).
df3 = pd.DataFrame(np.ones((3, 4)) * 2, index=[2, 3, 4], columns=['a', 'b', 'c', 'd'])
res = pd.concat([df1, df2, df3], ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# Append a single row held in a Series (formerly
# df1.append(s1, ignore_index=True)): the Series is turned into a
# one-row frame first.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, index=[1, 2, 3], columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0

# Add a column: assign a Series whose index matches the frame's rows.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, index=[1, 2, 3], columns=['a', 'b', 'c', 'd'])
df1['e'] = pd.Series([2, 3, 4], index=[1, 2, 3])
print(df1)
#      a    b    c    d  e
# 1  0.0  0.0  0.0  0.0  2
# 2  0.0  0.0  0.0  0.0  3
# 3  0.0  0.0  0.0  0.0  4
pandas.merge
使用columns来merge合并
# Example 1: merge on one shared column.
df1 = pd.DataFrame(np.ones((3, 2)) * 1, index=[0, 1, 2], columns=['a', 'b'])
df1['c'] = pd.Series([2., 3., 4.], index=[0, 1, 2])
df2 = pd.DataFrame(np.ones((3, 2)) * 3, index=[0, 1, 2], columns=['e', 'f'])
df2['c'] = pd.Series([2., 3., 4.], index=[0, 1, 2])
print(df1)
print(df2)
#      a    b    c
# 0  1.0  1.0  2.0
# 1  1.0  1.0  3.0
# 2  1.0  1.0  4.0
#      e    f    c
# 0  3.0  3.0  2.0
# 1  3.0  3.0  3.0
# 2  3.0  3.0  4.0
df3 = pd.merge(df1, df2, on='c')
print(df3)
#      a    b    c    e    f
# 0  1.0  1.0  2.0  3.0  3.0
# 1  1.0  1.0  3.0  3.0  3.0
# 2  1.0  1.0  4.0  3.0  3.0

# Example 2: merge on a key column.
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
res = pd.merge(left, right, on='key')
print(res)

# Merging on several keys
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
#   key1 key2   A   B
# 0   K0   K0  A0  B0
# 1   K0   K1  A1  B1
# 2   K1   K0  A2  B2
# 3   K2   K1  A3  B3
#   key1 key2   C   D
# 0   K0   K0  C0  D0
# 1   K1   K0  C1  D1
# 2   K1   K0  C2  D2
# 3   K2   K0  C3  D3

# key1 and key2 are treated as one combined key. The default how='inner'
# keeps only the key combinations present on both sides.
# Note: a left row is paired with every matching right row, e.g.
#   left:   2  K1  K0  A2  B2
#   right:  1  K1  K0  C1  D1
#           2  K1  K0  C2  D2
res = pd.merge(left, right, on=['key1', 'key2'])
print(res)
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2

# how is one of 'left', 'right', 'outer' (keep all keys, fill the gaps
# with NaN) or 'inner' (keep only matching keys).
# Here right is the primary table: all of its rows are kept.
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
#   key1 key2    A    B   C   D
# 0   K0   K0   A0   B0  C0  D0
# 1   K1   K0   A2   B2  C1  D1
# 2   K1   K0   A2   B2  C2  D2
# 3   K2   K0  NaN  NaN  C3  D3
使用index来merge合并
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
print(left)
print(right)
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

# Merge on the row labels of both frames.
# Outer join: every label from either side is kept.
res1 = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res1)
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

# Inner join: only the labels present on both sides are kept.
res2 = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res2)
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2
indicator:显示merge的方式
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
print(df2)
#    col1 col_left
# 0     0        a
# 1     1        b
#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2

# indicator=True adds a _merge column recording where each row came from.
res1 = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res1)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# Passing a string instead gives the indicator column that name.
res2 = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res2)
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only
给属性添加后缀,以方便merge后进行区分
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
print(girls)
#     k  age
# 0  K0    1
# 1  K1    2
# 2  K2    3
#     k  age
# 0  K0    4
# 1  K0    5
# 2  K3    6

# Both frames have an 'age' column; suffixes names the two copies so they
# can be told apart after the merge.
res = pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner')
print(res)
#     k  age_boy  age_girl
# 0  K0        1         4
# 1  K0        1         5
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Series: plot the running total of 1000 random samples.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()
# DataFrame: the same for four columns A-D; data.plot() is called once for the whole frame.
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
data.plot()
plt.show()
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='Blue', label="Class 1")
data.plot.scatter(x='A', y='C', color='Green', label='Class 2', ax=ax) # ax=ax passes the first plot's return value so the second scatter is combined with it
plt.show()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。