当前位置:   article > 正文

Python数据分析_第06课:数据清洗与初步分析_笔记_1、读取给定的数据集catering_sale.xls,找到大于4000,小于800的销售额并判定为

1、读取给定的数据集catering_sale.xls,找到大于4000,小于800的销售额并判定为属


GitHub: https://github.com/RealEmperor/Python-for-Data-Analysis

缺失值处理——拉格朗日插值法

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy.interpolate import lagrange  # 导入拉格朗日插值函数

np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))

inputfile = 'data/catering_sale.xls'  # 销量数据路径
outputfile = 'data/sales.xls'  # 输出数据路径

data = pd.read_excel(inputfile)  # 读入数据

# 过滤异常值,将其变为空值
"""
data[u'销量'][(data[u'销量'] < 400) | (data[u'销量'] > 5000)] = None  
上面这样写会有警告:
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

如果要更改原始数据,请使用单一赋值操作(loc):
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None

如果想要一个副本,请确保强制让 Pandas 创建副本:
error_data = data.copy() 
error_data.loc[(error_data[u'销量'] < 400) | (error_data[u'销量'] > 5000), u'销量'] = None

参考:https://www.jianshu.com/p/72274ccb647a
"""
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None


# 自定义列向量插值函数
# s为列向量,n为被插值的位置,k为取前后的数据个数,默认为5
def ployinterp_column(s, n, k=5):
    y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]  # 取数
    y = y[y.notnull()]  # 剔除空值
    return lagrange(y.index, list(y))(n)  # 插值并返回插值结果


# 逐个元素判断是否需要插值
for i in data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]:  # 如果为空即插值。
            """
            data[i][j] = ployinterp_column(data[i], j)  
            这样写会有警告:
            SettingWithCopyWarning: 
            A value is trying to be set on a copy of a slice from a DataFrame
            """
            data.loc[j, i] = ployinterp_column(data[i], j)

data.to_excel(outputfile)  # 输出结果,写入文件
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54

dataframe合并

#dataframe合并
# 1
df1 = DataFrame({
   'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({
   'key': ['a', 'b', 'd'],
                 'data2': range(3)})
print(df1)
print(df2)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
   data2 key
0      0   a
1      1   b
2      2   d
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
pd.merge(df1, df2)
  • 1
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
pd.merge(df1, df2, on='key')
  • 1
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
# 2
df3 = DataFrame({
   'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({
   'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
  • 1
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
pd.merge(df1, df2, how='outer')
  • 1
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
# 3
df1 = DataFrame({
   'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})
df2 = DataFrame({
   'key': ['a', 'b', 'a', 'b', 'd'],
                 'data2': range(5)})
print(df1)
print(df2)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
   data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
pd.merge(df1, df2, on='key', how='left')
  • 1
data1 key data2
0 0 b 1.0
1 0 b 3.0
2 1 b 1.0
3 1 b 3.0
4 2 a 0.0
5 2 a 2.0
6 3 c NaN
7 4 a 0.0
8 4 a 2.0
9 5 b 1.0
10 5 b 3.0
pd.merge(df1, df2, how='inner')
  • 1
data1 key data2
0 0 b 1
1 0 b 3
2 1 b 1
3 1 b 3
4 5 b 1
5 5 b 3
6 2 a 0
7 2 a 2
8 4 a 0
9 4 a 2
# 4
left = DataFrame({
   'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({
   'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
# 5
pd.merge(left, right, on='key1')
  • 1
  • 2
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
  • 1
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7

索引上的合并

# 1
left1 = DataFrame({
   'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = DataFrame({
   'group_val': [3.5, 7]}, index=['a', 'b'])
print(left1)
print(right1)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
pd.merge(left1, right1, left_on='key', right_index=True)
  • 1
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
  • 1
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
5 c 5 NaN
# 2
lefth = DataFrame({
   'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
print(lefth)
print(righth)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
   data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002
3   3.0  Nevada  2001
4   4.0  Nevada  2002
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
  • 1
data key1 key2 event1 event2
0 0.0 Ohio 2000 4 5
0 0.0 Ohio 2000 6 7
1 1.0 Ohio 2001 8 9
2 2.0 Ohio 2002 10 11
3 3.0 Nevada 2001 0 1
pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')
  • 1
  • 2
data key1 key2 event1 event2
0 0.0 Ohio 2000 4.0 5.0
0 0.0 Ohio 2000 6.0 7.0
1 1.0 Ohio 2001 8.0 9.0
2 2.0 Ohio 2002 10.0 11.0
3 3.0 Nevada 2001 0.0 1.0
4 4.0 Nevada 2002 NaN NaN
4 NaN Nevada 2000 2.0 3.0
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2)
print(right2)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
  • 1
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
# 3
left2.join(right2, how='outer')
  • 1
  • 2
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
left1.join(right1, on='key')
  • 1
key value group_val
0 a 0 3.5
1 b 1 7.0
2 a 2 3.5
3 a 3 3.5
4 b 4 7.0
5 c 5 NaN
# 4
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
left2.join([right2, another])
  • 1
  • 2
  • 3
  • 4
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0 9.0 10.0
e 5.0 6.0 13.0 14.0 11.0 12.0
left2.join([right2, another], how='outer')
  • 1
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
b NaN NaN 7.0 8.0 NaN NaN
c 3.0 4.0 9.0 10.0 9.0 10.0
d NaN NaN 11.0 12.0 NaN NaN
e 5.0 6.0 13.0 14.0 11.0 12.0
f NaN NaN NaN NaN 16.0 17.0

轴向连接

# 1
arr = np.arange(12).reshape((3, 4))
print(arr)

np.concatenate([arr, arr], axis=1)
  • 1
  • 2
  • 3
  • 4
  • 5
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]





array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
# 2
s1 = Series([0, 1], index=
  • 1
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/541589
推荐阅读
相关标签
  

闽ICP备14008679号