赞
踩
import matplotlib.pyplot as plt import numpy as np import pandas as pd from pandas import Series, DataFrame from scipy.interpolate import lagrange # 导入拉格朗日插值函数 np.random.seed(12345) plt.rc('figure', figsize=(10, 6)) inputfile = 'data/catering_sale.xls' # 销量数据路径 outputfile = 'data/sales.xls' # 输出数据路径 data = pd.read_excel(inputfile) # 读入数据 # 过滤异常值,将其变为空值 """ data[u'销量'][(data[u'销量'] < 400) | (data[u'销量'] > 5000)] = None 上面这样写会有警告: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame 如果要更改原始数据,请使用单一赋值操作(loc): data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None 如果想要一个副本,请确保强制让 Pandas 创建副本: error_data = data.copy() error_data.loc[(error_data[u'销量'] < 400) | (error_data[u'销量'] > 5000), u'销量'] = None 参考:https://www.jianshu.com/p/72274ccb647a """ data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None # 自定义列向量插值函数 # s为列向量,n为被插值的位置,k为取前后的数据个数,默认为5 def ployinterp_column(s, n, k=5): y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))] # 取数 y = y[y.notnull()] # 剔除空值 return lagrange(y.index, list(y))(n) # 插值并返回插值结果 # 逐个元素判断是否需要插值 for i in data.columns: for j in range(len(data)): if (data[i].isnull())[j]: # 如果为空即插值。 """ data[i][j] = ployinterp_column(data[i], j) 这样写会有警告: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame """ data.loc[j, i] = ployinterp_column(data[i], j) data.to_excel(outputfile) # 输出结果,写入文件
#dataframe合并
# 1
df1 = DataFrame({
'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df2 = DataFrame({
'key': ['a', 'b', 'd'],
'data2': range(3)})
print(df1)
print(df2)
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
data2 key
0 0 a
1 1 b
2 2 d
pd.merge(df1, df2)
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
pd.merge(df1, df2, on='key')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
# 2
df3 = DataFrame({
'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df4 = DataFrame({
'rkey': ['a', 'b', 'd'],
'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
data1 | lkey | data2 | rkey | |
---|---|---|---|---|
0 | 0 | b | 1 | b |
1 | 1 | b | 1 | b |
2 | 6 | b | 1 | b |
3 | 2 | a | 0 | a |
4 | 4 | a | 0 | a |
5 | 5 | a | 0 | a |
pd.merge(df1, df2, how='outer')
data1 | key | data2 | |
---|---|---|---|
0 | 0.0 | b | 1.0 |
1 | 1.0 | b | 1.0 |
2 | 6.0 | b | 1.0 |
3 | 2.0 | a | 0.0 |
4 | 4.0 | a | 0.0 |
5 | 5.0 | a | 0.0 |
6 | 3.0 | c | NaN |
7 | NaN | d | 2.0 |
# 3
df1 = DataFrame({
'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df2 = DataFrame({
'key': ['a', 'b', 'a', 'b', 'd'],
'data2': range(5)})
print(df1)
print(df2)
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 b
data2 key
0 0 a
1 1 b
2 2 a
3 3 b
4 4 d
pd.merge(df1, df2, on='key', how='left')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1.0 |
1 | 0 | b | 3.0 |
2 | 1 | b | 1.0 |
3 | 1 | b | 3.0 |
4 | 2 | a | 0.0 |
5 | 2 | a | 2.0 |
6 | 3 | c | NaN |
7 | 4 | a | 0.0 |
8 | 4 | a | 2.0 |
9 | 5 | b | 1.0 |
10 | 5 | b | 3.0 |
pd.merge(df1, df2, how='inner')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 0 | b | 3 |
2 | 1 | b | 1 |
3 | 1 | b | 3 |
4 | 5 | b | 1 |
5 | 5 | b | 3 |
6 | 2 | a | 0 |
7 | 2 | a | 2 |
8 | 4 | a | 0 |
9 | 4 | a | 2 |
# 4
left = DataFrame({
'key1': ['foo', 'foo', 'bar'],
'key2': ['one', 'two', 'one'],
'lval': [1, 2, 3]})
right = DataFrame({
'key1': ['foo', 'foo', 'bar', 'bar'],
'key2': ['one', 'one', 'one', 'two'],
'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')
key1 | key2 | lval | rval | |
---|---|---|---|---|
0 | foo | one | 1.0 | 4.0 |
1 | foo | one | 1.0 | 5.0 |
2 | foo | two | 2.0 | NaN |
3 | bar | one | 3.0 | 6.0 |
4 | bar | two | NaN | 7.0 |
# 5
pd.merge(left, right, on='key1')
key1 | key2_x | lval | key2_y | rval | |
---|---|---|---|---|---|
0 | foo | one | 1 | one | 4 |
1 | foo | one | 1 | one | 5 |
2 | foo | two | 2 | one | 4 |
3 | foo | two | 2 | one | 5 |
4 | bar | one | 3 | one | 6 |
5 | bar | one | 3 | two | 7 |
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
key1 | key2_left | lval | key2_right | rval | |
---|---|---|---|---|---|
0 | foo | one | 1 | one | 4 |
1 | foo | one | 1 | one | 5 |
2 | foo | two | 2 | one | 4 |
3 | foo | two | 2 | one | 5 |
4 | bar | one | 3 | one | 6 |
5 | bar | one | 3 | two | 7 |
# 1
left1 = DataFrame({
'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = DataFrame({
'group_val': [3.5, 7]}, index=['a', 'b'])
print(left1)
print(right1)
key value
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5
group_val
a 3.5
b 7.0
pd.merge(left1, right1, left_on='key', right_index=True)
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
5 | c | 5 | NaN |
# 2
lefth = DataFrame({
'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'key2': [2000, 2001, 2002, 2001, 2002],
'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
[2001, 2000, 2000, 2000, 2001, 2002]],
columns=['event1', 'event2'])
print(lefth)
print(righth)
data key1 key2
0 0.0 Ohio 2000
1 1.0 Ohio 2001
2 2.0 Ohio 2002
3 3.0 Nevada 2001
4 4.0 Nevada 2002
event1 event2
Nevada 2001 0 1
2000 2 3
Ohio 2000 4 5
2000 6 7
2001 8 9
2002 10 11
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
data | key1 | key2 | event1 | event2 | |
---|---|---|---|---|---|
0 | 0.0 | Ohio | 2000 | 4 | 5 |
0 | 0.0 | Ohio | 2000 | 6 | 7 |
1 | 1.0 | Ohio | 2001 | 8 | 9 |
2 | 2.0 | Ohio | 2002 | 10 | 11 |
3 | 3.0 | Nevada | 2001 | 0 | 1 |
pd.merge(lefth, righth, left_on=['key1', 'key2'],
right_index=True, how='outer')
data | key1 | key2 | event1 | event2 | |
---|---|---|---|---|---|
0 | 0.0 | Ohio | 2000 | 4.0 | 5.0 |
0 | 0.0 | Ohio | 2000 | 6.0 | 7.0 |
1 | 1.0 | Ohio | 2001 | 8.0 | 9.0 |
2 | 2.0 | Ohio | 2002 | 10.0 | 11.0 |
3 | 3.0 | Nevada | 2001 | 0.0 | 1.0 |
4 | 4.0 | Nevada | 2002 | NaN | NaN |
4 | NaN | Nevada | 2000 | 2.0 | 3.0 |
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2)
print(right2)
Ohio Nevada
a 1.0 2.0
c 3.0 4.0
e 5.0 6.0
Missouri Alabama
b 7.0 8.0
c 9.0 10.0
d 11.0 12.0
e 13.0 14.0
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
Ohio | Nevada | Missouri | Alabama | |
---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
# 3
left2.join(right2, how='outer')
Ohio | Nevada | Missouri | Alabama | |
---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
left1.join(right1, on='key')
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
1 | b | 1 | 7.0 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
4 | b | 4 | 7.0 |
5 | c | 5 | NaN |
# 4
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
left2.join([right2, another])
Ohio | Nevada | Missouri | Alabama | New York | Oregon | |
---|---|---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 | 9.0 | 10.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 | 11.0 | 12.0 |
left2.join([right2, another], how='outer')
Ohio | Nevada | Missouri | Alabama | New York | Oregon | |
---|---|---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN | 7.0 | 8.0 |
b | NaN | NaN | 7.0 | 8.0 | NaN | NaN |
c | 3.0 | 4.0 | 9.0 | 10.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 | NaN | NaN |
e | 5.0 | 6.0 | 13.0 | 14.0 | 11.0 | 12.0 |
f | NaN | NaN | NaN | NaN | 16.0 | 17.0 |
# 1
arr = np.arange(12).reshape((3, 4))
print(arr)
np.concatenate([arr, arr], axis=1)
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
# 2
s1 = Series([0, 1], index=
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。