#The point of intersection between pandas and other analysis libraries is usually NumPy arrays.
#To convert a DataFrame to a NumPy array, use the .values attribute
import pandas as pd
import numpy as np
data = pd.DataFrame({'x0':[1,2,3,4,5],
'x1':[0.01,-0.01,0.25,-4.1,0],
'y':[-1.5,0.,3.6,1.3,-2]})
data
x0 | x1 | y | |
---|---|---|---|
0 | 1 | 0.01 | -1.5 |
1 | 2 | -0.01 | 0.0 |
2 | 3 | 0.25 | 3.6 |
3 | 4 | -4.10 | 1.3 |
4 | 5 | 0.00 | -2.0 |
data.columns
Index(['x0', 'x1', 'y'], dtype='object')
data.values
array([[ 1. , 0.01, -1.5 ],
[ 2. , -0.01, 0. ],
[ 3. , 0.25, 3.6 ],
[ 4. , -4.1 , 1.3 ],
[ 5. , 0. , -2. ]])
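#Note: newer pandas versions also provide DataFrame.to_numpy(), which the pandas
#documentation recommends over the .values attribute; for this frame it produces the same array
data.to_numpy()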
#Convert the array back into a DataFrame
df2 = pd.DataFrame(data.values,columns=['one','two','three'])
df2
one | two | three | |
---|---|---|---|
0 | 1.0 | 0.01 | -1.5 |
1 | 2.0 | -0.01 | 0.0 |
2 | 3.0 | 0.25 | 3.6 |
3 | 4.0 | -4.10 | 1.3 |
4 | 5.0 | 0.00 | -2.0 |
#The .values attribute is best used when your data is homogeneous (e.g., all numeric)
df3 = data.copy()
df3['strings'] = ['a','b','c','d','e']
df3
x0 | x1 | y | strings | |
---|---|---|---|---|
0 | 1 | 0.01 | -1.5 | a |
1 | 2 | -0.01 | 0.0 | b |
2 | 3 | 0.25 | 3.6 | c |
3 | 4 | -4.10 | 1.3 | d |
4 | 5 | 0.00 | -2.0 | e |
df3.values
array([[1, 0.01, -1.5, 'a'],
[2, -0.01, 0.0, 'b'],
[3, 0.25, 3.6, 'c'],
[4, -4.1, 1.3, 'd'],
[5, 0.0, -2.0, 'e']], dtype=object)
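#If you only want the numeric part of a mixed-dtype frame, one option (a sketch) is to
#select the numeric columns first so the resulting array gets a numeric dtype
df3.select_dtypes(include=[np.number]).values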
#For some models, you may only want to use a subset of the columns. I recommend using loc indexing with values
model_cols = ['x0','x1']
data.loc[:,model_cols].values
array([[ 1. , 0.01],
[ 2. , -0.01],
[ 3. , 0.25],
[ 4. , -4.1 ],
[ 5. , 0. ]])
data['category'] = pd.Categorical(['a','b','a','a','b'],categories=['a','b'])
data
x0 | x1 | y | category | |
---|---|---|---|---|
0 | 1 | 0.01 | -1.5 | a |
1 | 2 | -0.01 | 0.0 | b |
2 | 3 | 0.25 | 3.6 | a |
3 | 4 | -4.10 | 1.3 | a |
4 | 5 | 0.00 | -2.0 | b |
#If we wanted to replace the 'category' column with dummy variables, we first create the dummies,
#then drop the 'category' column, and finally join the result
dummies = pd.get_dummies(data.category,prefix='category')
dummies
category_a | category_b | |
---|---|---|
0 | 1 | 0 |
1 | 0 | 1 |
2 | 1 | 0 |
3 | 1 | 0 |
4 | 0 | 1 |
data_with_dummies = data.drop('category',axis=1).join(dummies)
data_with_dummies
x0 | x1 | y | category_a | category_b | |
---|---|---|---|---|---|
0 | 1 | 0.01 | -1.5 | 1 | 0 |
1 | 2 | -0.01 | 0.0 | 0 | 1 |
2 | 3 | 0.25 | 3.6 | 1 | 0 |
3 | 4 | -4.10 | 1.3 | 1 | 0 |
4 | 5 | 0.00 | -2.0 | 0 | 1 |
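#As a side note, pandas.get_dummies can also be applied to the whole DataFrame at once;
#this sketch should produce a result equivalent to the drop/join above
pd.get_dummies(data, columns=['category'])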
data = pd.DataFrame({'x0':[1,2,3,4,5],
'x1':[0.01,-0.01,0.25,-4.1,0],
'y':[-1.5,0.,3.6,1.3,-2]})
data
x0 | x1 | y | |
---|---|---|---|
0 | 1 | 0.01 | -1.5 |
1 | 2 | -0.01 | 0.0 |
2 | 3 | 0.25 | 3.6 |
3 | 4 | -4.10 | 1.3 |
4 | 5 | 0.00 | -2.0 |
import patsy
y,X = patsy.dmatrices('y~x0+x1',data)
y
DesignMatrix with shape (5, 1)
y
-1.5
0.0
3.6
1.3
-2.0
Terms:
'y' (column 0)
X
DesignMatrix with shape (5, 3)
Intercept x0 x1
1 1 0.01
1 2 -0.01
1 3 0.25
1 4 -4.10
1 5 0.00
Terms:
'Intercept' (column 0)
'x0' (column 1)
'x1' (column 2)
np.asarray(y)
array([[-1.5],
[ 0. ],
[ 3.6],
[ 1.3],
[-2. ]])
np.asarray(X)
array([[ 1. , 1. , 0.01],
[ 1. , 2. , -0.01],
[ 1. , 3. , 0.25],
[ 1. , 4. , -4.1 ],
[ 1. , 5. , 0. ]])
#You can suppress the intercept by adding the term +0 to the model
patsy.dmatrices('y~x0+x1+0',data)[1]
DesignMatrix with shape (5, 2)
x0 x1
1 0.01
2 -0.01
3 0.25
4 -4.10
5 0.00
Terms:
'x0' (column 0)
'x1' (column 1)
coef,resid,_,_ = np.linalg.lstsq(X,y)
coef
array([[ 0.31290976],
[-0.07910564],
[-0.26546384]])
coef = pd.Series(coef.squeeze(),index=X.design_info.column_names)
coef
Intercept 0.312910
x0 -0.079106
x1 -0.265464
dtype: float64
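#As a quick check (a sketch), the fitted values and residuals implied by these
#coefficients can be computed directly from the design matrix
fitted = np.dot(np.asarray(X), np.asarray(coef))
resid = np.asarray(y).squeeze() - fitted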
y,X = patsy.dmatrices('y~x0+np.log(np.abs(x1+1))',data)
X
DesignMatrix with shape (5, 3)
Intercept x0 np.log(np.abs(x1 + 1))
1 1 0.00995
1 2 -0.01005
1 3 0.22314
1 4 1.13140
1 5 0.00000
Terms:
'Intercept' (column 0)
'x0' (column 1)
'np.log(np.abs(x1 + 1))' (column 2)
#Some commonly used variable transformations include standardizing (to mean 0 and variance 1) and centering (subtracting the mean)
y,X = patsy.dmatrices('y~standardize(x0)+center(x1)',data)
X
DesignMatrix with shape (5, 3)
Intercept standardize(x0) center(x1)
1 -1.41421 0.78
1 -0.70711 0.76
1 0.00000 1.02
1 0.70711 -3.33
1 1.41421 0.77
Terms:
'Intercept' (column 0)
'standardize(x0)' (column 1)
'center(x1)' (column 2)
#The patsy.build_design_matrices function can apply transformations to new out-of-sample data using the information saved from the original in-sample dataset
new_data = pd.DataFrame({'x0':[6,7,8,9],
'x1':[3.1,-0.5,0,2.3],
'y':[1,2,3,4]})
new_X = patsy.build_design_matrices([X.design_info],new_data)
new_X
[DesignMatrix with shape (4, 3)
Intercept standardize(x0) center(x1)
1 2.12132 3.87
1 2.82843 0.27
1 3.53553 0.77
1 4.24264 3.07
Terms:
'Intercept' (column 0)
'standardize(x0)' (column 1)
'center(x1)' (column 2)]
y,X = patsy.dmatrices('y~I(x0+x1)',data)
X
DesignMatrix with shape (5, 2)
Intercept I(x0 + x1)
1 1.01
1 1.99
1 3.25
1 -0.10
1 5.00
Terms:
'Intercept' (column 0)
'I(x0 + x1)' (column 1)
data = pd.DataFrame({'key1':['a','a','b','b','a','b','a','b'],
'key2':[0,1,0,1,0,1,0,0],
'v1':[1,2,3,4,5,6,7,8],
'v2':[-1,0,2.5,-0.5,4.0,-1.2,0.2,-1.7]})
y,X = patsy.dmatrices('v2~key1',data)
X
DesignMatrix with shape (8, 2)
Intercept key1[T.b]
1 0
1 0
1 1
1 1
1 0
1 1
1 0
1 1
Terms:
'Intercept' (column 0)
'key1' (column 1)
y,X = patsy.dmatrices('v2~key1+0',data)
X
DesignMatrix with shape (8, 2)
key1[a] key1[b]
1 0
1 0
0 1
0 1
1 0
0 1
1 0
0 1
Terms:
'key1' (columns 0:2)
#Numeric columns can be interpreted as categorical with the C function
y,X = patsy.dmatrices('v2~C(key2)',data)
X
DesignMatrix with shape (8, 2)
Intercept C(key2)[T.1]
1 0
1 1
1 0
1 1
1 0
1 1
1 0
1 0
Terms:
'Intercept' (column 0)
'C(key2)' (column 1)
data['key2'] = data['key2'].map({0:'zero',1:'one'})
data
key1 | key2 | v1 | v2 | |
---|---|---|---|---|
0 | a | zero | 1 | -1.0 |
1 | a | one | 2 | 0.0 |
2 | b | zero | 3 | 2.5 |
3 | b | one | 4 | -0.5 |
4 | a | zero | 5 | 4.0 |
5 | b | one | 6 | -1.2 |
6 | a | zero | 7 | 0.2 |
7 | b | zero | 8 | -1.7 |
y,X = patsy.dmatrices('v2~key1+key2',data)
X
DesignMatrix with shape (8, 3)
Intercept key1[T.b] key2[T.zero]
1 0 1
1 0 0
1 1 1
1 1 0
1 0 1
1 1 0
1 0 1
1 1 1
Terms:
'Intercept' (column 0)
'key1' (column 1)
'key2' (column 2)
y,X = patsy.dmatrices('v2~key1+key2+key1:key2',data)
X
DesignMatrix with shape (8, 4)
Intercept key1[T.b] key2[T.zero] key1[T.b]:key2[T.zero]
1 0 1 0
1 0 0 0
1 1 1 1
1 1 0 0
1 0 1 0
1 1 0 0
1 0 1 0
1 1 1 1
Terms:
'Intercept' (column 0)
'key1' (column 1)
'key2' (column 2)
'key1:key2' (column 3)
import statsmodels.api as sm
import statsmodels.formula.api as smf
#Draw normally distributed samples with the given mean and variance
def dnorm(mean, variance, size=1):
    if isinstance(size, int):
        size = size,
    return mean + np.sqrt(variance) * np.random.randn(*size)
np.random.seed(12345)
N = 100
X = np.c_[dnorm(0,0.4,size=N),
dnorm(0,0.6,size=N),
dnorm(0,0.2,size=N)]
eps = dnorm(0,0.1,size=N)
beta = [0.1,0.3,0.5]
y = np.dot(X,beta)+eps
X[:5]
array([[-0.12946849, -1.21275292, 0.50422488],
[ 0.30291036, -0.43574176, -0.25417986],
[-0.32852189, -0.02530153, 0.13835097],
[-0.35147471, -0.71960511, -0.25821463],
[ 1.2432688 , -0.37379916, -0.52262905]])
y[:5]
array([ 0.42786349, -0.67348041, -0.09087764, -0.48949442, -0.12894109])
#Linear models are generally fitted with an intercept term, as we saw with Patsy. The sm.add_constant function can add an intercept column to an existing matrix
X_model = sm.add_constant(X)
X_model[:5]
array([[ 1. , -0.12946849, -1.21275292, 0.50422488],
[ 1. , 0.30291036, -0.43574176, -0.25417986],
[ 1. , -0.32852189, -0.02530153, 0.13835097],
[ 1. , -0.35147471, -0.71960511, -0.25821463],
[ 1. , 1.2432688 , -0.37379916, -0.52262905]])
#The sm.OLS class can fit an ordinary least squares linear regression
model = sm.OLS(y,X)
results = model.fit()
results.params
array([0.17826108, 0.22303962, 0.50095093])
print(results.summary())
                                 OLS Regression Results
=======================================================================================
Dep. Variable:                      y   R-squared (uncentered):                   0.430
Model:                            OLS   Adj. R-squared (uncentered):              0.413
Method:                 Least Squares   F-statistic:                              24.42
Date:                Thu, 30 Dec 2021   Prob (F-statistic):                    7.44e-12
Time:                        11:09:06   Log-Likelihood:                         -34.305
No. Observations:                 100   AIC:                                      74.61
Df Residuals:                      97   BIC:                                      82.42
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.1783      0.053      3.364      0.001       0.073       0.283
x2             0.2230      0.046      4.818      0.000       0.131       0.315
x3             0.5010      0.080      6.237      0.000       0.342       0.660
==============================================================================
Omnibus:                        4.662   Durbin-Watson:                   2.201
Prob(Omnibus):                  0.097   Jarque-Bera (JB):                4.098
Skew:                           0.481   Prob(JB):                        0.129
Kurtosis:                       3.243   Cond. No.                         1.74
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
data = pd.DataFrame(X,columns=['col0','col1','col2'])
data['y'] = y
data[:5]
col0 | col1 | col2 | y | |
---|---|---|---|---|
0 | -0.129468 | -1.212753 | 0.504225 | 0.427863 |
1 | 0.302910 | -0.435742 | -0.254180 | -0.673480 |
2 | -0.328522 | -0.025302 | 0.138351 | -0.090878 |
3 | -0.351475 | -0.719605 | -0.258215 | -0.489494 |
4 | 1.243269 | -0.373799 | -0.522629 | -0.128941 |
#You can also use the statsmodels formula API with Patsy formula strings
results = smf.ols('y~col0+col1+col2',data=data).fit()
results.params
Intercept 0.033559
col0 0.176149
col1 0.224826
col2 0.514808
dtype: float64
results.tvalues
Intercept 0.952188
col0 3.319754
col1 4.850730
col2 6.303971
dtype: float64
results.predict(data[:5])
0 -0.002327
1 -0.141904
2 0.041226
3 -0.323070
4 -0.100535
dtype: float64
init_x = 4
import random
values = [init_x,init_x]
N = 1000
b0 = 0.8
b1 = -0.4
noise = dnorm(0,0.1,N)
#Simulate an autoregressive process: each new value depends on the two previous values plus noise
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)
MAXLAGX = 5
model = sm.tsa.AR(values)
results = model.fit(MAXLAGX)
results.params
array([-0.00616093, 0.78446347, -0.40847891, -0.01364148, 0.01496872,
0.01429462])
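#Note: sm.tsa.AR has been removed in recent statsmodels releases; AutoReg is the
#suggested replacement (a sketch -- the estimated parameters should be similar)
from statsmodels.tsa.ar_model import AutoReg
results_ar = AutoReg(values, lags=MAXLAGX).fit()
results_ar.params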
train = pd.read_csv('datasets/titanic/train.csv')
test = pd.read_csv('datasets/titanic/test.csv')
train[:4]
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
train.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
test.isnull().sum()
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
impute_value = train['Age'].median()
train['Age'] = train['Age'].fillna(impute_value)
test['Age'] = test['Age'].fillna(impute_value)
train['IsFemale'] = (train['Sex']=='female').astype(int)
test['IsFemale'] = (test['Sex']=='female').astype(int)
train[:5]
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | IsFemale | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 1 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 1 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 0 |
test[:5]
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | IsFemale | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q | 0 |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S | 1 |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q | 0 |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S | 0 |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S | 1 |
predictors = ['Pclass','IsFemale','Age']
X_train = train[predictors].values
X_train[:5]
array([[ 3., 0., 22.],
[ 1., 1., 38.],
[ 3., 1., 26.],
[ 1., 1., 35.],
[ 3., 0., 35.]])
X_test = test[predictors].values
X_test[:5]
array([[ 3. , 0. , 34.5],
[ 3. , 1. , 47. ],
[ 2. , 0. , 62. ],
[ 3. , 0. , 27. ],
[ 3. , 1. , 22. ]])
y_train = train['Survived'].values
y_train[:5]
array([0, 1, 1, 1, 0], dtype=int64)
#Create a model instance using scikit-learn's LogisticRegression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)
LogisticRegression()
y_predict = model.predict(X_test)
y_predict[:10]
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)
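#If the true labels y_true for the test set were available (they are not included in this
#test.csv), the prediction accuracy could be computed as (y_true == y_predict).mean()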
from sklearn.linear_model import LogisticRegressionCV
model_cv = LogisticRegressionCV(10)
model_cv.fit(X_train,y_train)
LogisticRegressionCV()
from sklearn.model_selection import cross_val_score
model = LogisticRegression(C=10)
scores = cross_val_score(model,X_train,y_train,cv=4)
scores
array([0.77578475, 0.79820628, 0.77578475, 0.78828829])
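#The average of the fold scores gives a single cross-validated estimate of accuracy
scores.mean()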
path = 'datasets/bitly_usagov/example.txt'
open(path).readline()
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
import json
path = 'datasets/bitly_usagov/example.txt'
records = [json.loads(line) for line in open(path)]
records[0]
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11', 'c': 'US', 'nk': 1, 'tz': 'America/New_York', 'gr': 'MA', 'g': 'A6qOVH', 'h': 'wfLQtf', 'l': 'orofrog', 'al': 'en-US,en;q=0.8', 'hh': '1.usa.gov', 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf', 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991', 't': 1331923247, 'hc': 1331822918, 'cy': 'Danvers', 'll': [42.576698, -70.954903]}
time_zones = [rec['tz'] for rec in records]
#It turns out that not all of the records have a time zone field, which causes an error
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-6-767e72d0f2fa> in <module>
----> 1 time_zones = [rec['tz'] for rec in records]
      2 #It turns out that not all of the records have a time zone field, which causes an error
<ipython-input-6-767e72d0f2fa> in <listcomp>(.0)
----> 1 time_zones = [rec['tz'] for rec in records]
      2 #It turns out that not all of the records have a time zone field, which causes an error
KeyError: 'tz'
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
#One way to count is to store the counts in a dictionary while iterating through the time zones
def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts
from collections import defaultdict
def get_counts2(sequence):
    counts = defaultdict(int)  #values will be initialized to 0
    for x in sequence:
        counts[x] += 1
    return counts
counts = get_counts(time_zones)
counts['America/New_York']
1251
len(time_zones)
3440
#If we wanted the top 10 time zones and their counts
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]
top_counts(counts)
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]
#If you search the Python standard library, you may find the collections.Counter class, which makes this task even easier
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]
import pandas as pd
frame = pd.DataFrame(records)
frame.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   a            3440 non-null   object
 1   c            2919 non-null   object
 2   nk           3440 non-null   float64
 3   tz           3440 non-null   object
 4   gr           2919 non-null   object
 5   g            3440 non-null   object
 6   h            3440 non-null   object
 7   l            3440 non-null   object
 8   al           3094 non-null   object
 9   hh           3440 non-null   object
 10  r            3440 non-null   object
 11  u            3440 non-null   object
 12  t            3440 non-null   float64
 13  hc           3440 non-null   float64
 14  cy           2919 non-null   object
 15  ll           2919 non-null   object
 16  _heartbeat_  120 non-null    float64
 17  kw           93 non-null     object
dtypes: float64(4), object(14)
memory usage: 500.8+ KB
frame['tz'][:10]
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7
8
9
Name: tz, dtype: object
#For a Series, we can use the value_counts method
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
America/New_York 1251
521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
Name: tz, dtype: int64
frame['tz'][:10]
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7
8
9
Name: tz, dtype: object
#Replace the missing values using the fillna method, and use boolean array indexing for the empty strings
clean_tz = frame['tz'].fillna('Missing')
clean_tz[:10]
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7
8
9
Name: tz, dtype: object
clean_tz[clean_tz ==''] = 'Unknown'
clean_tz[:10]
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7 Unknown
8 Unknown
9 Unknown
Name: tz, dtype: object
tz_counts = clean_tz.value_counts()
tz_counts[:10]
America/New_York 1251
Unknown 521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Missing 120
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
Name: tz, dtype: int64
import seaborn as sns
subset = tz_counts[:10]
sns.barplot(y=subset.index,x=subset.values)
#The 'a' column contains information about the browser, device, or application used to perform the URL shortening
frame['a'][1]
'GoogleMaps/RochesterNY'
frame['a'][50]
'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
frame['a'][51]
'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
#str.split splits on whitespace by default; here we split on '; ' to break apart the agent string
frame['a'][51].split('; ')
['Mozilla/5.0 (Linux',
'U',
'Android 2.2.2',
'en-us',
'LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1']
results = pd.Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
0 Mozilla/5.0
1 GoogleMaps/RochesterNY
2 Mozilla/4.0
3 Mozilla/5.0
4 Mozilla/5.0
dtype: object
results.value_counts()[:8]
Mozilla/5.0 2594
Mozilla/4.0 601
GoogleMaps/RochesterNY 121
Opera/9.80 34
TEST_INTERNET_AGENT 24
GoogleProducer 21
Mozilla/6.0 5
BlackBerry8520/5.0.0.681 4
dtype: int64
frame.a.notnull()[-10:]
3550 True
3551 True
3552 True
3553 True
3554 True
3555 True
3556 True
3557 True
3558 True
3559 True
Name: a, dtype: bool
#Since some of the agent strings are missing, we'll exclude those records from the data
cframe = frame[frame.a.notnull()]
cframe[:10]
a | c | nk | tz | gr | g | h | l | al | hh | r | u | t | hc | cy | ll | _heartbeat_ | kw | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | US | 1.0 | America/New_York | MA | A6qOVH | wfLQtf | orofrog | en-US,en;q=0.8 | 1.usa.gov | http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... | http://www.ncbi.nlm.nih.gov/pubmed/22415991 | 1.331923e+09 | 1.331823e+09 | Danvers | [42.576698, -70.954903] | NaN | NaN |
1 | GoogleMaps/RochesterNY | US | 0.0 | America/Denver | UT | mwszkS | mwszkS | bitly | NaN | j.mp | http://www.AwareMap.com/ | http://www.monroecounty.gov/etc/911/rss.php | 1.331923e+09 | 1.308262e+09 | Provo | [40.218102, -111.613297] | NaN | NaN |
2 | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | US | 1.0 | America/New_York | DC | xxr3Qb | xxr3Qb | bitly | en-US | 1.usa.gov | http://t.co/03elZC4Q | http://boxer.senate.gov/en/press/releases/0316... | 1.331923e+09 | 1.331920e+09 | Washington | [38.9007, -77.043098] | NaN | NaN |
3 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | BR | 0.0 | America/Sao_Paulo | 27 | zCaLwp | zUtuOu | alelex88 | pt-br | 1.usa.gov | direct | http://apod.nasa.gov/apod/ap120312.html | 1.331923e+09 | 1.331923e+09 | Braz | [-23.549999, -46.616699] | NaN | NaN |
4 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | US | 0.0 | America/New_York | MA | 9b6kNl | 9b6kNl | bitly | en-US,en;q=0.8 | bit.ly | http://www.shrewsbury-ma.gov/selco/ | http://www.shrewsbury-ma.gov/egov/gallery/1341... | 1.331923e+09 | 1.273672e+09 | Shrewsbury | [42.286499, -71.714699] | NaN | NaN |
5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | US | 0.0 | America/New_York | MA | axNK8c | axNK8c | bitly | en-US,en;q=0.8 | bit.ly | http://www.shrewsbury-ma.gov/selco/ | http://www.shrewsbury-ma.gov/egov/gallery/1341... | 1.331923e+09 | 1.273673e+09 | Shrewsbury | [42.286499, -71.714699] | NaN | NaN |
6 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1... | PL | 0.0 | Europe/Warsaw | 77 | wcndER | zkpJBR | bnjacobs | pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4 | 1.usa.gov | http://plus.url.google.com/url?sa=z&n=13319232... | http://www.nasa.gov/mission_pages/nustar/main/... | 1.331923e+09 | 1.331923e+09 | Luban | [51.116699, 15.2833] | NaN | NaN |
7 | Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/2... | None | 0.0 | NaN | wcndER | zkpJBR | bnjacobs | bg,en-us;q=0.7,en;q=0.3 | 1.usa.gov | http://www.facebook.com/ | http://www.nasa.gov/mission_pages/nustar/main/... | 1.331923e+09 | 1.331923e+09 | NaN | NaN | NaN | NaN | |
8 | Opera/9.80 (X11; Linux zbov; U; en) Presto/2.1... | None | 0.0 | NaN | wcndER | zkpJBR | bnjacobs | en-US, en | 1.usa.gov | http://www.facebook.com/l.php?u=http%3A%2F%2F1... | http://www.nasa.gov/mission_pages/nustar/main/... | 1.331923e+09 | 1.331923e+09 | NaN | NaN | NaN | NaN | |
9 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | None | 0.0 | NaN | zCaLwp | zUtuOu | alelex88 | pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4 | 1.usa.gov | http://t.co/o1Pd0WeV | http://apod.nasa.gov/apod/ap120312.html | 1.331923e+09 | 1.331923e+09 | NaN | NaN | NaN | NaN |
#Next we want to compute a value for each row indicating whether or not it is Windows
cframe['os'] = np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')
<ipython-input-69-02329ab5f824>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
cframe['os'] = np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')
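#The warning can be avoided by working on an explicit copy of the subset before adding
#new columns, for example (a sketch):
#cframe = frame[frame.a.notnull()].copy()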
cframe['os'][:10]
0 Windows
1 Not Windows
2 Windows
3 Not Windows
4 Windows
5 Windows
6 Windows
7 Windows
8 Not Windows
9 Windows
Name: os, dtype: object
#You can then group the data by the time zone column and the newly generated operating system column
by_tz_os = cframe.groupby(['tz','os'])
#Analogous to the value_counts function, the group counts can be computed with size, and the result can then be reshaped with unstack
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
os | Not Windows | Windows |
---|---|---|
tz | ||
245.0 | 276.0 | |
Africa/Cairo | 0.0 | 3.0 |
Africa/Casablanca | 0.0 | 1.0 |
Africa/Ceuta | 0.0 | 2.0 |
Africa/Johannesburg | 0.0 | 1.0 |
Africa/Lusaka | 0.0 | 1.0 |
America/Anchorage | 4.0 | 1.0 |
America/Argentina/Buenos_Aires | 1.0 | 0.0 |
America/Argentina/Cordoba | 0.0 | 1.0 |
America/Argentina/Mendoza | 0.0 | 1.0 |
#Finally, let's select the time zones with the largest overall counts.
#To do that, I construct an indirect index array from the row counts in agg_counts,
#which can be used to sort in ascending order
indexer = agg_counts.sum(1).argsort()
indexer[:10]
tz
24
Africa/Cairo 20
Africa/Casablanca 21
Africa/Ceuta 92
Africa/Johannesburg 87
Africa/Lusaka 53
America/Anchorage 54
America/Argentina/Buenos_Aires 57
America/Argentina/Cordoba 26
America/Argentina/Mendoza 55
dtype: int64
#Use the take method to select the rows in that order, then slice off the last 10 rows (the 10 largest values)
count_subset = agg_counts.take(indexer[-10:])
count_subset
os | Not Windows | Windows |
---|---|---|
tz | ||
America/Sao_Paulo | 13.0 | 20.0 |
Europe/Madrid | 16.0 | 19.0 |
Pacific/Honolulu | 0.0 | 36.0 |
Asia/Tokyo | 2.0 | 35.0 |
Europe/London | 43.0 | 31.0 |
America/Denver | 132.0 | 59.0 |
America/Los_Angeles | 130.0 | 252.0 |
America/Chicago | 115.0 | 285.0 |
245.0 | 276.0 | |
America/New_York | 339.0 | 912.0 |
#pandas has a convenient method called nlargest that does the same thing
agg_counts.sum(1).nlargest(10)
tz
America/New_York 1251.0
521.0
America/Chicago 400.0
America/Los_Angeles 382.0
America/Denver 191.0
Europe/London 74.0
Asia/Tokyo 37.0
Pacific/Honolulu 36.0
Europe/Madrid 35.0
America/Sao_Paulo 33.0
dtype: float64
#Rearrange the data for plotting
count_subset = count_subset.stack()
count_subset
tz                   os
America/Sao_Paulo    Not Windows     13.0
                     Windows         20.0
Europe/Madrid        Not Windows     16.0
                     Windows         19.0
Pacific/Honolulu     Not Windows      0.0
                     Windows         36.0
Asia/Tokyo           Not Windows      2.0
                     Windows         35.0
Europe/London        Not Windows     43.0
                     Windows         31.0
America/Denver       Not Windows    132.0
                     Windows         59.0
America/Los_Angeles  Not Windows    130.0
                     Windows        252.0
America/Chicago      Not Windows    115.0
                     Windows        285.0
                     Not Windows    245.0
                     Windows        276.0
America/New_York     Not Windows    339.0
                     Windows        912.0
dtype: float64
count_subset.name = 'total'
count_subset = count_subset.reset_index()
count_subset[:10]
tz | os | total | |
---|---|---|---|
0 | America/Sao_Paulo | Not Windows | 13.0 |
1 | America/Sao_Paulo | Windows | 20.0 |
2 | Europe/Madrid | Not Windows | 16.0 |
3 | Europe/Madrid | Windows | 19.0 |
4 | Pacific/Honolulu | Not Windows | 0.0 |
5 | Pacific/Honolulu | Windows | 36.0 |
6 | Asia/Tokyo | Not Windows | 2.0 |
7 | Asia/Tokyo | Windows | 35.0 |
8 | Europe/London | Not Windows | 43.0 |
9 | Europe/London | Windows | 31.0 |
sns.barplot(x='total',y='tz',hue='os',data=count_subset)
#The plot above doesn't make it easy to see the relative percentage of Windows users in the smaller groups, so let's normalize the group percentages to sum to 1
def norm_total(group):
    group['normed_total'] = group.total / group.total.sum()
    return group
results = count_subset.groupby('tz').apply(norm_total)
results[:10]
tz | os | total | normed_total | |
---|---|---|---|---|
0 | America/Sao_Paulo | Not Windows | 13.0 | 0.393939 |
1 | America/Sao_Paulo | Windows | 20.0 | 0.606061 |
2 | Europe/Madrid | Not Windows | 16.0 | 0.457143 |
3 | Europe/Madrid | Windows | 19.0 | 0.542857 |
4 | Pacific/Honolulu | Not Windows | 0.0 | 0.000000 |
5 | Pacific/Honolulu | Windows | 36.0 | 1.000000 |
6 | Asia/Tokyo | Not Windows | 2.0 | 0.054054 |
7 | Asia/Tokyo | Windows | 35.0 | 0.945946 |
8 | Europe/London | Not Windows | 43.0 | 0.581081 |
9 | Europe/London | Windows | 31.0 | 0.418919 |
sns.barplot(x='normed_total',y='tz',hue='os',data=results)
#The normalized sums can be computed more efficiently with the transform method on a groupby object
g = count_subset.groupby('tz')
results2 = count_subset.total/g.total.transform('sum')
results2[:10]
0 0.393939
1 0.606061
2 0.457143
3 0.542857
4 0.000000
5 1.000000
6 0.054054
7 0.945946
8 0.581081
9 0.418919
Name: total, dtype: float64
import pandas as pd
#Show a bit less output
pd.options.display.max_rows = 10
unames = ['user_id','gender','age','occupation','zip']
users = pd.read_table('datasets/movielens/users.dat',sep='::',header=None,names=unames)
users[:5]
<ipython-input-101-ffe8596a8cfd>:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
users = pd.read_table('datasets/movielens/users.dat',sep='::',header=None,names=unames)
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
rnames = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat',sep='::',header=None,names=rnames)
ratings[:5]
<ipython-input-103-bafd8ea1cf17>:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
ratings = pd.read_table('datasets/movielens/ratings.dat',sep='::',header=None,names=rnames)
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
mnames = ['movie_id','title','genres']
movies = pd.read_table('datasets/movielens/movies.dat',sep='::',header=None,names=mnames)
movies[:5]
<ipython-input-118-35e3f9b1d007>:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
movies = pd.read_table('datasets/movielens/movies.dat',sep='::',header=None,names=mnames)
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
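#The ParserWarning above can be silenced by passing engine='python' explicitly, as the
#warning message suggests, for example (a sketch):
#users = pd.read_table('datasets/movielens/users.dat', sep='::', header=None,
#                      names=unames, engine='python')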
#Using pandas's merge functionality, we first merge the ratings table with the users table, and then merge that result with the movies data.
#pandas infers which columns to use as the merge (or join) keys based on overlapping names
data = pd.merge(pd.merge(ratings,users),movies)
data[:5]
user_id | movie_id | rating | timestamp | gender | age | occupation | zip | title | genres | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | F | 1 | 10 | 48067 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 2 | 1193 | 5 | 978298413 | M | 56 | 16 | 70072 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | 1193 | 4 | 978220179 | M | 25 | 12 | 32793 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | 1193 | 4 | 978199279 | M | 25 | 7 | 22903 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | 1193 | 5 | 978158471 | M | 50 | 1 | 95350 | One Flew Over the Cuckoo's Nest (1975) | Drama |
data.iloc[0]
user_id 1
movie_id 1193
rating 5
timestamp 978300760
gender F
age 1
occupation 10
zip 48067
title One Flew Over the Cuckoo's Nest (1975)
genres Drama
Name: 0, dtype: object
#To get the mean rating of each movie broken down by gender, we can use the pivot_table method
mean_ratings = data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
mean_ratings
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
... | ... | ... |
Zed & Two Noughts, A (1985) | 3.500000 | 3.380952 |
Zero Effect (1998) | 3.864407 | 3.723140 |
Zero Kelvin (Kjærlighetens kjøtere) (1995) | NaN | 3.500000 |
Zeus and Roxanne (1997) | 2.777778 | 2.357143 |
eXistenZ (1999) | 3.098592 | 3.289086 |
3706 rows × 2 columns
mean_ratings[:5]
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
#First, filter out movies that received fewer than 250 ratings (a completely arbitrary number);
#to do that, I group the data by title and use size() to get a Series whose elements are the group sizes for each title
ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]
title
$1,000,000 Duck (1971) 37
'Night Mother (1986) 70
'Til There Was You (1997) 52
'burbs, The (1989) 303
...And Justice for All (1979) 199
1-900 (1994) 2
10 Things I Hate About You (1999) 700
101 Dalmatians (1961) 565
101 Dalmatians (1996) 364
12 Angry Men (1957) 616
dtype: int64
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
'101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
'13th Warrior, The (1999)', '2 Days in the Valley (1996)',
'20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
'2010 (1984)',
...
'X-Men (2000)', 'Year of Living Dangerously (1982)',
'Yellow Submarine (1968)', 'You've Got Mail (1998)',
'Young Frankenstein (1974)', 'Young Guns (1988)',
'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
'Zero Effect (1998)', 'eXistenZ (1999)'],
dtype='object', name='title', length=1216)
#The index of titles with at least 250 ratings can then be used to select the desired rows from mean_ratings
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
gender | F | M |
---|---|---|
title | ||
'burbs, The (1989) | 2.793478 | 2.962085 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 |
101 Dalmatians (1961) | 3.791444 | 3.500000 |
101 Dalmatians (1996) | 3.240000 | 2.911215 |
12 Angry Men (1957) | 4.184397 | 4.328421 |
... | ... | ... |
Young Guns (1988) | 3.371795 | 3.425620 |
Young Guns II (1990) | 2.934783 | 2.904025 |
Young Sherlock Holmes (1985) | 3.514706 | 3.363344 |
Zero Effect (1998) | 3.864407 | 3.723140 |
eXistenZ (1999) | 3.098592 | 3.289086 |
1216 rows × 2 columns
#To see the top films among female viewers, we can sort by the F column in descending order
top_female_ratings = mean_ratings.sort_values(by='F',ascending=False)
top_female_ratings[:10]
gender | F | M |
---|---|---|
title | ||
Close Shave, A (1995) | 4.644444 | 4.473795 |
Wrong Trousers, The (1993) | 4.588235 | 4.478261 |
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | 4.572650 | 4.464589 |
Wallace & Gromit: The Best of Aardman Animation (1996) | 4.563107 | 4.385075 |
Schindler's List (1993) | 4.562602 | 4.491415 |
Shawshank Redemption, The (1994) | 4.539075 | 4.560625 |
Grand Day Out, A (1992) | 4.537879 | 4.293255 |
To Kill a Mockingbird (1962) | 4.536667 | 4.372611 |
Creature Comforts (1990) | 4.513889 | 4.272277 |
Usual Suspects, The (1995) | 4.513317 | 4.518248 |
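#Similarly, the top films among male viewers could be obtained by sorting on the 'M'
#column in descending order (a sketch)
#mean_ratings.sort_values(by='M', ascending=False)[:10]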
#One way is to add a column to mean_ratings containing the difference in means, and then sort by it
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
mean_ratings[:10]
gender | F | M | diff |
---|---|---|---|
title | |||
'burbs, The (1989) | 2.793478 | 2.962085 | 0.168607 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 | -0.334586 |
101 Dalmatians (1961) | 3.791444 | 3.500000 | -0.291444 |
101 Dalmatians (1996) | 3.240000 | 2.911215 | -0.328785 |
12 Angry Men (1957) | 4.184397 | 4.328421 | 0.144024 |
13th Warrior, The (1999) | 3.112000 | 3.168000 | 0.056000 |
2 Days in the Valley (1996) | 3.488889 | 3.244813 | -0.244076 |
20,000 Leagues Under the Sea (1954) | 3.670103 | 3.709205 | 0.039102 |
2001: A Space Odyssey (1968) | 3.825581 | 4.129738 | 0.304156 |
2010 (1984) | 3.446809 | 3.413712 | -0.033097 |
sorted_by_diff = mean_ratings.sort_values(by='diff')
sorted_by_diff[:10]
gender | F | M | diff |
---|---|---|---|
title | |||
Dirty Dancing (1987) | 3.790378 | 2.959596 | -0.830782 |
Jumpin' Jack Flash (1986) | 3.254717 | 2.578358 | -0.676359 |
Grease (1978) | 3.975265 | 3.367041 | -0.608224 |
Little Women (1994) | 3.870588 | 3.321739 | -0.548849 |
Steel Magnolias (1989) | 3.901734 | 3.365957 | -0.535777 |
Anastasia (1997) | 3.800000 | 3.281609 | -0.518391 |
Rocky Horror Picture Show, The (1975) | 3.673016 | 3.160131 | -0.512885 |
Color Purple, The (1985) | 4.158192 | 3.659341 | -0.498851 |
Age of Innocence, The (1993) | 3.827068 | 3.339506 | -0.487561 |
Free Willy (1993) | 2.921348 | 2.438776 | -0.482573 |
#Reversing the order of the rows and slicing off the top 10 rows, we get the movies preferred by men that weren't rated as highly by women
sorted_by_diff[::-1][:10]
gender | F | M | diff |
---|---|---|---|
title | |||
Good, The Bad and The Ugly, The (1966) | 3.494949 | 4.221300 | 0.726351 |
Kentucky Fried Movie, The (1977) | 2.878788 | 3.555147 | 0.676359 |
Dumb & Dumber (1994) | 2.697987 | 3.336595 | 0.638608 |
Longest Day, The (1962) | 3.411765 | 4.031447 | 0.619682 |
Cable Guy, The (1996) | 2.250000 | 2.863787 | 0.613787 |
Evil Dead II (Dead By Dawn) (1987) | 3.297297 | 3.909283 | 0.611985 |
Hidden, The (1987) | 3.137931 | 3.745098 | 0.607167 |
Rocky III (1982) | 2.361702 | 2.943503 | 0.581801 |
Caddyshack (1980) | 3.396135 | 3.969737 | 0.573602 |
For a Few Dollars More (1965) | 3.409091 | 3.953795 | 0.544704 |
#Suppose instead you wanted the movies that elicited the most disagreement among viewers, independent of gender identification.
#Disagreement can be measured by the variance or standard deviation of the ratings
ratings_std_by_title = data.groupby('title')['rating'].std()
ratings_std_by_title[:10]
title
$1,000,000 Duck (1971) 1.092563
'Night Mother (1986) 1.118636
'Til There Was You (1997) 1.020159
'burbs, The (1989) 1.107760
...And Justice for All (1979) 0.878110
1-900 (1994) 0.707107
10 Things I Hate About You (1999) 0.989815
101 Dalmatians (1961) 0.982103
101 Dalmatians (1996) 1.098717
12 Angry Men (1957) 0.812731
Name: rating, dtype: float64
ratings_std_by_title = ratings_std_by_title.loc[active_titles]
ratings_std_by_title[:10]
title
'burbs, The (1989) 1.107760
10 Things I Hate About You (1999) 0.989815
101 Dalmatians (1961) 0.982103
101 Dalmatians (1996) 1.098717
12 Angry Men (1957) 0.812731
13th Warrior, The (1999) 1.140421
2 Days in the Valley (1996) 0.921592
20,000 Leagues Under the Sea (1954) 0.869685
2001: A Space Odyssey (1968) 1.042504
2010 (1984) 0.946618
Name: rating, dtype: float64
ratings_std_by_title.sort_values(ascending=False)[:10]
title
Dumb & Dumber (1994) 1.321333
Blair Witch Project, The (1999) 1.316368
Natural Born Killers (1994) 1.307198
Tank Girl (1995) 1.277695
Rocky Horror Picture Show, The (1975) 1.260177
Eyes Wide Shut (1999) 1.259624
Evita (1996) 1.253631
Billy Madison (1995) 1.249970
Fear and Loathing in Las Vegas (1998) 1.246408
Bicentennial Man (1999) 1.245533
Name: rating, dtype: float64
names1880 = pd.read_table('datasets/babynames/yob1880.txt',sep=',',names=['name','sex','births'])
names1880.head()
name | sex | births | |
---|---|---|---|
0 | Mary | F | 7065 |
1 | Anna | F | 2604 |
2 | Emma | F | 2003 |
3 | Elizabeth | F | 1939 |
4 | Minnie | F | 1746 |
#For simplicity, we can use the sum of births by sex as the total number of births for that year
names1880.groupby('sex').births.sum()
sex
F 90993
M 110493
Name: births, dtype: int64
#Since the dataset is split into one file per year, one of the first things to do is to assemble all of the data into a single DataFrame and add a year field.
#You can do this with pandas.concat
years = range(1880,2011)
pieces = []
columns = ['name','sex','births']
for year in years:
    path = 'datasets/babynames/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)
#Concatenate everything into a single DataFrame
names = pd.concat(pieces,ignore_index=True)
names.head()
name | sex | births | year | |
---|---|---|---|---|
0 | Mary | F | 7065 | 1880 |
1 | Anna | F | 2604 | 1880 |
2 | Emma | F | 2003 | 1880 |
3 | Elizabeth | F | 1939 | 1880 |
4 | Minnie | F | 1746 | 1880 |
#You can start aggregating the data by year and sex using groupby or pivot_table
total_births = names.pivot_table('births',index='year',columns='sex',aggfunc=sum)
total_births.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1896468 | 2050234 |
2007 | 1916888 | 2069242 |
2008 | 1883645 | 2032310 |
2009 | 1827643 | 1973359 |
2010 | 1759010 | 1898382 |
total_births.plot(title='Total births by sex and year')
def add_prop(group):
    group['prop'] = group.births / group.births.sum()
    return group
names = names.groupby(['year','sex']).apply(add_prop)
names
name | sex | births | year | group | prop | |
---|---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 | 0.019188 |
... | ... | ... | ... | ... | ... | ... |
1690779 | Zymaire | M | 5 | 2010 | 0.000003 | 0.000003 |
1690780 | Zyonne | M | 5 | 2010 | 0.000003 | 0.000003 |
1690781 | Zyquarius | M | 5 | 2010 | 0.000003 | 0.000003 |
1690782 | Zyran | M | 5 | 2010 | 0.000003 | 0.000003 |
1690783 | Zzyzx | M | 5 | 2010 | 0.000003 | 0.000003 |
1690784 rows × 6 columns
#When performing a group operation like this, it's often valuable to do a sanity check, such as verifying that the prop column sums to 1 within all groups
names.groupby(['year','sex']).prop.sum()
year sex
1880 F 1.0
M 1.0
1881 F 1.0
M 1.0
1882 F 1.0
...
2008 M 1.0
2009 F 1.0
M 1.0
2010 F 1.0
M 1.0
Name: prop, Length: 262, dtype: float64
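#A stricter version of this check is to assert that all of the group sums are numerically close to 1
np.allclose(names.groupby(['year','sex']).prop.sum(), 1)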
#The top 1,000 names for each sex/year combination
def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year','sex'])
top1000 = grouped.apply(get_top1000)
#Drop the group index, which we don't need
top1000.reset_index(inplace=True,drop=True)
#If you prefer a do-it-yourself approach, you could instead try the following
pieces = []
for year, group in names.groupby(['year', 'sex']):
    pieces.append(group.sort_values(by='births', ascending=False)[:1000])
top1000 = pd.concat(pieces,ignore_index=True)
top1000
name | sex | births | year | group | prop | |
---|---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 | 0.019188 |
... | ... | ... | ... | ... | ... | ... |
261872 | Camilo | M | 194 | 2010 | 0.000102 | 0.000102 |
261873 | Destin | M | 194 | 2010 | 0.000102 | 0.000102 |
261874 | Jaquan | M | 194 | 2010 | 0.000102 | 0.000102 |
261875 | Jaydan | M | 194 | 2010 | 0.000102 | 0.000102 |
261876 | Maxton | M | 193 | 2010 | 0.000102 | 0.000102 |
261877 rows × 6 columns
boys = top1000[top1000.sex=='M']
girls = top1000[top1000.sex=='F']
total_births = top1000.pivot_table('births',index='year',columns='name',aggfunc=sum)
total_births
name | Aaden | Aaliyah | Aarav | Aaron | Aarush | Ab | Abagail | Abb | Abbey | Abbie | Abbigail | Abbott | Abby | Abdiel | Abdul | Abdullah | Abe | Abel | Abelardo | Abigail | Abigale | Abigayle | Abner | Abraham | Abram | Abril | Ace | Acie | Ada | Adah | Adalberto | Adaline | Adalyn | Adalynn | Adam | Adamaris | Adams | Adan | Adda | Addie | Addilyn | Addison | Addisyn | Addyson | Adela | Adelaide | Adelard | Adelbert | Adele | Adelia | Adelina | Adeline | Adell | Adella | Adelle | Adelyn | Adelynn | Aden | Adilene | Adin | ... | Zada | Zadie | Zaid | Zaida | Zaidee | Zaiden | Zain | Zaire | Zakary | Zana | Zander | Zandra | Zane | Zaniyah | Zara | Zaria | Zariah | Zariyah | Zavier | Zavion | Zayden | Zayne | Zeb | Zebulon | Zechariah | Zed | Zeke | Zela | Zelda | Zelia | Zella | Zelma | Zelpha | Zena | Zenas | Zenia | Zennie | Zeno | Zenobia | Zeta | Zetta | Zettie | Zhane | Zigmund | Zillah | Zilpah | Zilpha | Zina | Zion | Zita | Zoa | Zoe | Zoey | Zoie | Zola | Zollie | Zona | Zora | Zula | Zuri |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1880 | NaN | NaN | NaN | 102.0 | NaN | NaN | NaN | NaN | NaN | 71.0 | NaN | NaN | 6.0 | NaN | NaN | NaN | 50.0 | 9.0 | NaN | 12.0 | NaN | NaN | 27.0 | 81.0 | 21.0 | NaN | NaN | NaN | 652.0 | 24.0 | NaN | 23.0 | NaN | NaN | 104.0 | NaN | NaN | NaN | 14.0 | 282.0 | NaN | 19.0 | NaN | NaN | 9.0 | 65.0 | NaN | 28.0 | 41.0 | 18.0 | NaN | 54.0 | NaN | 26.0 | 5.0 | NaN | NaN | 7.0 | NaN | NaN | ... | 13.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.0 | NaN | NaN | NaN | 6.0 | NaN | 6.0 | NaN | 31.0 | 19.0 | NaN | 7.0 | NaN | NaN | NaN | NaN | NaN | NaN | 8.0 | NaN | NaN | NaN | NaN | NaN | 6.0 | NaN | NaN | NaN | 8.0 | 23.0 | NaN | NaN | 7.0 | NaN | 8.0 | 28.0 | 27.0 | NaN |
1881 | NaN | NaN | NaN | 94.0 | NaN | NaN | NaN | NaN | NaN | 81.0 | NaN | NaN | 7.0 | NaN | NaN | NaN | 36.0 | 12.0 | NaN | 8.0 | NaN | NaN | 30.0 | 86.0 | 30.0 | NaN | NaN | 6.0 | 628.0 | 29.0 | NaN | 18.0 | NaN | NaN | 116.0 | NaN | NaN | NaN | 20.0 | 294.0 | NaN | 17.0 | NaN | NaN | 7.0 | 62.0 | NaN | 14.0 | 43.0 | 21.0 | NaN | 58.0 | 14.0 | 16.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 8.0 | 11.0 | NaN | 6.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 38.0 | 17.0 | NaN | 6.0 | NaN | NaN | NaN | NaN | 6.0 | NaN | 7.0 | NaN | NaN | NaN | 7.0 | 9.0 | 6.0 | NaN | NaN | NaN | NaN | 22.0 | NaN | NaN | 10.0 | NaN | 9.0 | 21.0 | 27.0 | NaN |
1882 | NaN | NaN | NaN | 85.0 | NaN | NaN | NaN | NaN | NaN | 80.0 | NaN | NaN | 11.0 | NaN | NaN | NaN | 50.0 | 10.0 | NaN | 14.0 | NaN | NaN | 32.0 | 91.0 | 25.0 | NaN | 8.0 | NaN | 689.0 | 27.0 | NaN | 16.0 | NaN | NaN | 114.0 | NaN | NaN | NaN | 17.0 | 347.0 | NaN | 21.0 | NaN | NaN | 17.0 | 74.0 | NaN | 14.0 | 64.0 | 23.0 | NaN | 70.0 | NaN | 18.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 9.0 | 7.0 | NaN | NaN | 5.0 | NaN | NaN | NaN | NaN | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.0 | NaN | NaN | NaN | NaN | NaN | 6.0 | NaN | 50.0 | 21.0 | NaN | 6.0 | NaN | NaN | NaN | NaN | 7.0 | NaN | 7.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | 8.0 | 25.0 | NaN | NaN | 9.0 | NaN | 17.0 | 32.0 | 21.0 | NaN |
1883 | NaN | NaN | NaN | 105.0 | NaN | NaN | NaN | NaN | NaN | 79.0 | NaN | NaN | NaN | NaN | NaN | NaN | 43.0 | 12.0 | NaN | 11.0 | NaN | NaN | 27.0 | 52.0 | 20.0 | NaN | 6.0 | NaN | 778.0 | 41.0 | NaN | 11.0 | NaN | NaN | 107.0 | NaN | NaN | NaN | 24.0 | 369.0 | NaN | 20.0 | NaN | NaN | 15.0 | 85.0 | NaN | 14.0 | 68.0 | 30.0 | NaN | 82.0 | NaN | 16.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 11.0 | 7.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | 6.0 | NaN | NaN | 5.0 | 55.0 | 16.0 | NaN | 13.0 | NaN | NaN | NaN | 6.0 | 5.0 | NaN | 15.0 | NaN | NaN | NaN | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | 23.0 | NaN | NaN | 10.0 | NaN | 11.0 | 35.0 | 25.0 | NaN |
1884 | NaN | NaN | NaN | 97.0 | NaN | NaN | NaN | NaN | NaN | 98.0 | NaN | NaN | 6.0 | NaN | NaN | NaN | 45.0 | 14.0 | NaN | 13.0 | NaN | NaN | 33.0 | 67.0 | 29.0 | NaN | NaN | NaN | 854.0 | 33.0 | NaN | 20.0 | NaN | NaN | 83.0 | NaN | NaN | NaN | 18.0 | 364.0 | NaN | 17.0 | NaN | NaN | 11.0 | 98.0 | 7.0 | 17.0 | 71.0 | 37.0 | 7.0 | 112.0 | 9.0 | 16.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 11.0 | 9.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11.0 | NaN | NaN | NaN | NaN | NaN | 7.0 | NaN | 63.0 | 29.0 | NaN | 11.0 | NaN | NaN | NaN | NaN | NaN | NaN | 10.0 | 9.0 | NaN | NaN | NaN | NaN | 6.0 | 7.0 | NaN | 11.0 | 13.0 | 31.0 | NaN | NaN | 14.0 | 6.0 | 8.0 | 58.0 | 27.0 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2006 | NaN | 3737.0 | NaN | 8279.0 | NaN | NaN | 297.0 | NaN | 404.0 | 440.0 | 630.0 | NaN | 1682.0 | NaN | NaN | 219.0 | NaN | 922.0 | NaN | 15615.0 | 297.0 | 351.0 | NaN | 2200.0 | 414.0 | 316.0 | 240.0 | NaN | 397.0 | NaN | NaN | NaN | NaN | NaN | 6775.0 | 286.0 | NaN | 1098.0 | NaN | NaN | NaN | 8054.0 | 470.0 | 872.0 | NaN | 285.0 | NaN | NaN | NaN | NaN | NaN | 676.0 | NaN | NaN | NaN | NaN | NaN | 1401.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 228.0 | 247.0 | 221.0 | NaN | 1079.0 | NaN | 1409.0 | NaN | 312.0 | 393.0 | 349.0 | NaN | 248.0 | NaN | 224.0 | 196.0 | NaN | NaN | 336.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1635.0 | NaN | NaN | 5145.0 | 2839.0 | 530.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2007 | NaN | 3941.0 | NaN | 8914.0 | NaN | NaN | 313.0 | NaN | 349.0 | 468.0 | 651.0 | NaN | 1573.0 | NaN | NaN | 224.0 | NaN | 939.0 | NaN | 15447.0 | 285.0 | 314.0 | NaN | 2139.0 | 463.0 | 736.0 | 279.0 | NaN | 460.0 | NaN | NaN | NaN | 316.0 | NaN | 6770.0 | 285.0 | NaN | 1080.0 | NaN | NaN | NaN | 12281.0 | 491.0 | 1380.0 | NaN | 409.0 | NaN | NaN | NaN | NaN | NaN | 839.0 | NaN | NaN | NaN | 335.0 | NaN | 1311.0 | NaN | 197.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 238.0 | 267.0 | NaN | NaN | 1052.0 | NaN | 1595.0 | 291.0 | 407.0 | 414.0 | 494.0 | NaN | 255.0 | NaN | 429.0 | 201.0 | NaN | NaN | 362.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2069.0 | NaN | NaN | 4925.0 | 3028.0 | 526.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2008 | 955.0 | 4028.0 | 219.0 | 8511.0 | NaN | NaN | 317.0 | NaN | 344.0 | 400.0 | 608.0 | NaN | 1328.0 | 199.0 | NaN | 210.0 | NaN | 863.0 | NaN | 15045.0 | NaN | 288.0 | NaN | 2143.0 | 477.0 | 585.0 | 322.0 | NaN | 520.0 | NaN | NaN | NaN | 576.0 | 328.0 | 6074.0 | NaN | NaN | 1110.0 | NaN | NaN | NaN | 11008.0 | 553.0 | 1428.0 | NaN | 555.0 | NaN | NaN | NaN | NaN | NaN | 910.0 | NaN | NaN | NaN | 527.0 | NaN | 1382.0 | NaN | NaN | ... | NaN | NaN | 219.0 | NaN | NaN | 231.0 | 273.0 | 255.0 | NaN | NaN | 1115.0 | NaN | 1568.0 | 316.0 | 376.0 | 442.0 | 535.0 | NaN | 304.0 | NaN | 563.0 | 267.0 | NaN | NaN | 365.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2027.0 | NaN | NaN | 4764.0 | 3438.0 | 492.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2009 | 1265.0 | 4352.0 | 270.0 | 7936.0 | NaN | NaN | 296.0 | NaN | 307.0 | 369.0 | 675.0 | NaN | 1274.0 | 229.0 | NaN | 256.0 | NaN | 960.0 | NaN | 14342.0 | 271.0 | NaN | NaN | 2088.0 | 554.0 | 477.0 | 418.0 | NaN | 531.0 | NaN | NaN | NaN | 861.0 | 433.0 | 5649.0 | NaN | NaN | 1122.0 | NaN | NaN | NaN | 10883.0 | 730.0 | 1451.0 | NaN | 534.0 | NaN | NaN | NaN | NaN | NaN | 919.0 | NaN | NaN | NaN | 777.0 | 331.0 | 1363.0 | NaN | NaN | ... | NaN | NaN | 199.0 | NaN | NaN | 297.0 | 295.0 | 237.0 | NaN | NaN | 1140.0 | NaN | 1511.0 | 391.0 | 364.0 | 357.0 | 602.0 | NaN | 245.0 | 199.0 | 744.0 | 295.0 | NaN | NaN | 339.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1860.0 | NaN | NaN | 5120.0 | 3981.0 | 496.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2010 | 448.0 | 4628.0 | 438.0 | 7374.0 | 226.0 | NaN | 277.0 | NaN | 295.0 | 324.0 | 585.0 | NaN | 1140.0 | 264.0 | NaN | 225.0 | NaN | 1119.0 | NaN | 14124.0 | 282.0 | NaN | NaN | 1899.0 | 483.0 | 395.0 | 395.0 | NaN | 525.0 | NaN | NaN | NaN | 1261.0 | 686.0 | 5062.0 | NaN | NaN | 937.0 | NaN | NaN | 260.0 | 10253.0 | 793.0 | 1605.0 | NaN | 705.0 | NaN | NaN | 285.0 | NaN | 281.0 | 983.0 | NaN | NaN | NaN | 825.0 | 458.0 | 1162.0 | NaN | NaN | ... | NaN | NaN | 209.0 | NaN | NaN | 397.0 | 278.0 | 222.0 | NaN | NaN | 1106.0 | NaN | 1445.0 | 370.0 | 390.0 | 323.0 | 608.0 | 304.0 | 309.0 | NaN | 919.0 | 318.0 | NaN | NaN | 358.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1926.0 | NaN | NaN | 6200.0 | 5164.0 | 504.0 | NaN | NaN | NaN | NaN | NaN | 258.0 |
131 rows × 6868 columns
total_births.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6868 entries, Aaden to Zuri
dtypes: float64(6868)
memory usage: 6.9 MB
subset = total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True,figsize=(12,10),grid=False,title='Number of births per year')
table = top1000.pivot_table('prop',index='year',columns='sex',aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex',
yticks=np.linspace(0,1.2,13),
xticks=range(1880,2020,10))
#Another interesting metric is the number of distinct names, taken in order of popularity from highest to lowest, that make up the top 50% of births
df = boys[boys.year==2010]
df
name | sex | births | year | group | prop | |
---|---|---|---|---|---|---|
260877 | Jacob | M | 21875 | 2010 | 0.011523 | 0.011523 |
260878 | Ethan | M | 17866 | 2010 | 0.009411 | 0.009411 |
260879 | Michael | M | 17133 | 2010 | 0.009025 | 0.009025 |
260880 | Jayden | M | 17030 | 2010 | 0.008971 | 0.008971 |
260881 | William | M | 16870 | 2010 | 0.008887 | 0.008887 |
... | ... | ... | ... | ... | ... | ... |
261872 | Camilo | M | 194 | 2010 | 0.000102 | 0.000102 |
261873 | Destin | M | 194 | 2010 | 0.000102 | 0.000102 |
261874 | Jaquan | M | 194 | 2010 | 0.000102 | 0.000102 |
261875 | Jaydan | M | 194 | 2010 | 0.000102 | 0.000102 |
261876 | Maxton | M | 193 | 2010 | 0.000102 | 0.000102 |
1000 rows × 6 columns
prop_cumsum = df.sort_values(by='prop',ascending=False).prop.cumsum()
prop_cumsum[:10]
260877 0.011523
260878 0.020934
260879 0.029959
260880 0.038930
260881 0.047817
260882 0.056579
260883 0.065155
260884 0.073414
260885 0.081528
260886 0.089621
Name: prop, dtype: float64
prop_cumsum.values.searchsorted(0.5)
116
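#searchsorted returns a zero-based position, so the number of distinct names needed to
#reach 50% of the 2010 births is this value plus 1, i.e. 117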
df = boys[boys.year==1900]
df
name | sex | births | year | group | prop | |
---|---|---|---|---|---|---|
40877 | John | M | 9834 | 1900 | 0.065319 | 0.065319 |
40878 | William | M | 8580 | 1900 | 0.056990 | 0.056990 |
40879 | James | M | 7246 | 1900 | 0.048129 | 0.048129 |
40880 | George | M | 5405 | 1900 | 0.035901 | 0.035901 |
40881 | Charles | M | 4102 | 1900 | 0.027246 | 0.027246 |
... | ... | ... | ... | ... | ... | ... |
41872 | Theron | M | 8 | 1900 | 0.000053 | 0.000053 |
41873 | Terrell | M | 8 | 1900 | 0.000053 | 0.000053 |
41874 | Solon | M | 8 | 1900 | 0.000053 | 0.000053 |
41875 | Rayfield | M | 8 | 1900 | 0.000053 | 0.000053 |
41876 | Sinclair | M | 8 | 1900 | 0.000053 | 0.000053 |
1000 rows × 6 columns
in1900 = df.sort_values(by='prop',ascending=False).prop.cumsum()
in1900.values.searchsorted(0.5)+1
25
#You can now apply this operation to each year/sex combination, grouping by those fields with groupby
#and applying a function that returns the count for each group
def get_quantile_count(group, q=0.5):
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1
diversity = top1000.groupby(['year','sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head()
sex | F | M |
---|---|---|
year | ||
1880 | 38 | 14 |
1881 | 38 | 14 |
1882 | 38 | 15 |
1883 | 39 | 15 |
1884 | 39 | 16 |
#Import matplotlib for plotting
import matplotlib.pyplot as plt
# Font settings so that non-ASCII (CJK) characters display correctly in matplotlib
plt.rcParams['font.sans-serif'] =['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
diversity.plot(title='Diversity metric by year')
#Extract the last letter from the name column
get_last_letter = lambda x:x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
table = names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc=sum)
table[:5]
(output truncated: 5 rows × 262 columns. Rows are indexed by last_letter (a, b, c, d, e); columns are a MultiIndex of sex (F, M) and year (1880-2010); each cell holds the total births for that letter/sex/year combination, with NaN where a letter did not occur in that year.)
subtable = table.reindex(columns=[1910,1960,2010],level='year')
subtable.head()
sex | F | M | ||||
---|---|---|---|---|---|---|
year | 1910 | 1960 | 2010 | 1910 | 1960 | 2010 |
last_letter | ||||||
a | 108376.0 | 691247.0 | 670605.0 | 977.0 | 5204.0 | 28438.0 |
b | NaN | 694.0 | 450.0 | 411.0 | 3912.0 | 38859.0 |
c | 5.0 | 49.0 | 946.0 | 482.0 | 15476.0 | 23125.0 |
d | 6750.0 | 3729.0 | 2607.0 | 22111.0 | 262112.0 | 44398.0 |
e | 133569.0 | 435013.0 | 313833.0 | 28655.0 | 178823.0 | 129012.0 |
#normalize the table by total births, computing a new table that holds each ending letter's share of total births for each sex
subtable.sum()
sex year
F 1910 396416.0
1960 2022062.0
2010 1759010.0
M 1910 194198.0
1960 2132588.0
2010 1898382.0
dtype: float64
letter_prop = subtable/subtable.sum()
letter_prop[:10]
sex | F | M | ||||
---|---|---|---|---|---|---|
year | 1910 | 1960 | 2010 | 1910 | 1960 | 2010 |
last_letter | ||||||
a | 0.273390 | 0.341853 | 0.381240 | 0.005031 | 0.002440 | 0.014980 |
b | NaN | 0.000343 | 0.000256 | 0.002116 | 0.001834 | 0.020470 |
c | 0.000013 | 0.000024 | 0.000538 | 0.002482 | 0.007257 | 0.012181 |
d | 0.017028 | 0.001844 | 0.001482 | 0.113858 | 0.122908 | 0.023387 |
e | 0.336941 | 0.215133 | 0.178415 | 0.147556 | 0.083853 | 0.067959 |
f | NaN | 0.000010 | 0.000055 | 0.000783 | 0.004325 | 0.001188 |
g | 0.000144 | 0.000157 | 0.000374 | 0.002250 | 0.009488 | 0.001404 |
h | 0.051529 | 0.036224 | 0.075852 | 0.045562 | 0.037907 | 0.051670 |
i | 0.001526 | 0.039965 | 0.031734 | 0.000844 | 0.000603 | 0.022628 |
j | NaN | NaN | 0.000090 | NaN | NaN | 0.000769 |
#with the letter proportions in hand, we can plot a bar chart for each sex, broken down by year
import matplotlib.pyplot as plt
fig,axes = plt.subplots(2,1,figsize=(10,8))
letter_prop['M'].plot(kind='bar',rot=0,ax=axes[0],title='Male')
letter_prop['F'].plot(kind='bar',rot=0,ax=axes[1],title='Female')
letter_prop = table/table.sum()
dny_ts = letter_prop.loc[['d','n','y'],'M'].T
dny_ts.head()
last_letter | d | n | y |
---|---|---|---|
year | |||
1880 | 0.083055 | 0.153213 | 0.075760 |
1881 | 0.083247 | 0.153214 | 0.077451 |
1882 | 0.085340 | 0.149560 | 0.077537 |
1883 | 0.084066 | 0.151646 | 0.079144 |
1884 | 0.086120 | 0.149915 | 0.080405 |
dny_ts.plot(title = 'Proportion of boys whose names end in d/n/y over time')
all_names = pd.Series(top1000.name.unique())
all_names[:10]
0 Mary
1 Anna
2 Emma
3 Elizabeth
4 Minnie
5 Margaret
6 Ida
7 Alice
8 Bertha
9 Sarah
dtype: object
lesley_like = all_names[all_names.str.lower().str.contains('lesl')]
lesley_like
632 Leslie
2294 Lesley
4262 Leslee
4728 Lesli
6103 Lesly
dtype: object
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()
name
Leslee 1082
Lesley 35022
Lesli 929
Leslie 370429
Lesly 10067
Name: births, dtype: int64
#let's aggregate by sex and year and normalize within each year
table = filtered.pivot_table('births',index='year',columns='sex',aggfunc='sum')
table = table.div(table.sum(1),axis=0)
table.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1.0 | NaN |
2007 | 1.0 | NaN |
2008 | 1.0 | NaN |
2009 | 1.0 | NaN |
2010 | 1.0 | NaN |
table.plot(style={'M':'k-','F':'k--'})
import json
db = json.load(open('datasets/usda_food/database.json'))
len(db)
6636
#each entry in db is a dict containing all of the data for a single food
#the 'nutrients' field is a list of dicts, one dict per nutrient
db[0].keys()
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
db[0]['nutrients'][0]
{'value': 25.18,
'units': 'g',
'description': 'Protein',
'group': 'Composition'}
nutrients = pd.DataFrame(db[0]['nutrients'])
nutrients[:7]
value | units | description | group | |
---|---|---|---|---|
0 | 25.18 | g | Protein | Composition |
1 | 29.20 | g | Total lipid (fat) | Composition |
2 | 3.06 | g | Carbohydrate, by difference | Composition |
3 | 3.28 | g | Ash | Other |
4 | 376.00 | kcal | Energy | Energy |
5 | 39.28 | g | Water | Composition |
6 | 1573.00 | kJ | Energy | Energy |
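#(added sketch, not part of the original notes) to work with nutrients across all foods,
#one possible approach is to collect each food's nutrient list into a single DataFrame,
#tagging every row with the food's id so it can later be joined back to `info`
pieces = []
for rec in db:
    food_nutrients = pd.DataFrame(rec['nutrients'])
    food_nutrients['id'] = rec['id']
    pieces.append(food_nutrients)
nutrients_all = pd.concat(pieces, ignore_index=True)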
#when converting a list of dicts to a DataFrame, we can specify a list of fields to extract;
#here we extract the food name, group, id, and manufacturer
info_keys = ['description','group','id','manufacturer']
info = pd.DataFrame(db,columns=info_keys)
info[-10:]
description | group | id | manufacturer | |
---|---|---|---|---|
6626 | CAMPBELL Soup Company, V8 Vegetable Juice, Ess... | Vegetables and Vegetable Products | 31010 | Campbell Soup Co. |
6627 | CAMPBELL Soup Company, V8 Vegetable Juice, Spi... | Vegetables and Vegetable Products | 31013 | Campbell Soup Co. |
6628 | CAMPBELL Soup Company, PACE, Jalapenos Nacho S... | Vegetables and Vegetable Products | 31014 | Campbell Soup Co. |
6629 | CAMPBELL Soup Company, V8 60% Vegetable Juice,... | Vegetables and Vegetable Products | 31016 | Campbell Soup Co. |
6630 | CAMPBELL Soup Company, V8 Vegetable Juice, Low... | Vegetables and Vegetable Products | 31017 | Campbell Soup Co. |
6631 | Bologna, beef, low fat | Sausages and Luncheon Meats | 42161 | |
6632 | Turkey and pork sausage, fresh, bulk, patty or... | Sausages and Luncheon Meats | 42173 | |
6633 | Babyfood, juice, pear | Baby Foods | 43408 | None |
6634 | Babyfood, dessert, banana yogurt, strained | Baby Foods | 43539 | None |
6635 | Babyfood, banana no tapioca, strained | Baby Foods | 43546 | None |
info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 description 6636 non-null object
1 group 6636 non-null object
2 id 6636 non-null int64
3 manufacturer 5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB
#value_counts shows the distribution of food groups
pd.value_counts(info.group)[:10]
Vegetables and Vegetable Products 812
Beef Products 618
Baked Products 496
Breakfast Cereals 403
Legumes and Legume Products 365
Fast Foods 365
Lamb, Veal, and Game Products 345
Sweets 341
Fruits and Fruit Juices 328
Pork Products 328
Name: group, dtype: int64
fec = pd.read_csv('datasets/fec/P00000001-ALL.csv')
fec.info()
D:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3165: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
 #   Column             Non-Null Count    Dtype
---  ------             --------------    -----
 0   cmte_id            1001731 non-null  object
 1   cand_id            1001731 non-null  object
 2   cand_nm            1001731 non-null  object
 3   contbr_nm          1001731 non-null  object
 4   contbr_city        1001712 non-null  object
 5   contbr_st          1001727 non-null  object
 6   contbr_zip         1001620 non-null  object
 7   contbr_employer    988002 non-null   object
 8   contbr_occupation  993301 non-null   object
 9   contb_receipt_amt  1001731 non-null  float64
 10  contb_receipt_dt   1001731 non-null  object
 11  receipt_desc       14166 non-null    object
 12  memo_cd            92482 non-null    object
 13  memo_text          97770 non-null    object
 14  form_tp            1001731 non-null  object
 15  file_num           1001731 non-null  int64
dtypes: float64(1), int64(1), object(14)
memory usage: 122.3+ MB
fec.iloc[123456]
cmte_id                           C00431445
cand_id                           P80003338
cand_nm                       Obama, Barack
contbr_nm                       ELLMAN, IRA
contbr_city                           TEMPE
contbr_st                                AZ
contbr_zip                        852816719
contbr_employer    ARIZONA STATE UNIVERSITY
contbr_occupation                 PROFESSOR
contb_receipt_amt                      50.0
contb_receipt_dt                  01-DEC-11
receipt_desc                            NaN
memo_cd                                 NaN
memo_text                               NaN
form_tp                               SA17A
file_num                             772372
Name: 123456, dtype: object
#unique gives the full list of distinct political candidates
unique_cands = fec.cand_nm.unique()
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
unique_cands[2]
'Obama, Barack'
parties = {'Bachmann, Michelle':'Republican',
'Romney, Mitt':'Republican',
'Obama, Barack':'Democrat',
"Roemer, Charles E. 'Buddy' III":'Republican',
'Pawlenty, Timothy':'Republican',
'Johnson, Gary Earl':'Republican',
'Paul, Ron':'Republican',
'Santorum, Rick':'Republican',
'Cain, Herman':'Republican',
'Gingrich, Newt':'Republican',
'McCotter, Thaddeus G':'Republican',
'Huntsman, Jon':'Republican',
'Perry, Rick':'Republican'}
fec.cand_nm[123456:123461]
123456 Obama, Barack
123457 Obama, Barack
123458 Obama, Barack
123459 Obama, Barack
123460 Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(parties)
123456 Democrat
123457 Democrat
123458 Democrat
123459 Democrat
123460 Democrat
Name: cand_nm, dtype: object
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()
Democrat 593746
Republican 407985
Name: party, dtype: int64
#first, note that the data include both contributions and refunds (negative contribution amounts)
(fec.contb_receipt_amt>0).value_counts()
True 991475
False 10256
Name: contb_receipt_amt, dtype: int64
fec = fec[fec.contb_receipt_amt>0]
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
fec.contbr_occupation.value_counts()[:10]
RETIRED 233990
INFORMATION REQUESTED 35107
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
INFORMATION REQUESTED PER BEST EFFORTS 21138
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
Name: contbr_occupation, dtype: int64
occ_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'INFORMATION REQUESTED(BEST EFFORTS)':'NOT PROVIDED',
'C.E.O':'CEO'}
#if there is no mapping, return x unchanged
f = lambda x :occ_mapping.get(x,x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
               'INFORMATION REQUESTED':'NOT PROVIDED',
               'SELF':'SELF-EMPLOYED',
               'SELF EMPLOYED':'SELF-EMPLOYED'
              }
#same treatment for employers; if there is no mapping, return x unchanged
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)
by_occupation = fec.pivot_table('contb_receipt_amt',index='contbr_occupation',
columns='party',aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1)>2000000]
over_2mm
party | Democrat | Republican |
---|---|---|
contbr_occupation | ||
ATTORNEY | 11141982.97 | 7.477194e+06 |
C.E.O. | 1690.00 | 2.592983e+06 |
CEO | 2074284.79 | 1.640758e+06 |
CONSULTANT | 2459912.71 | 2.544725e+06 |
ENGINEER | 951525.55 | 1.818374e+06 |
EXECUTIVE | 1355161.05 | 4.138850e+06 |
HOMEMAKER | 4248875.80 | 1.363428e+07 |
INVESTOR | 884133.00 | 2.431769e+06 |
LAWYER | 3160478.87 | 3.912243e+05 |
MANAGER | 762883.22 | 1.444532e+06 |
NOT PROVIDED | 4866973.96 | 2.023715e+07 |
OWNER | 1001567.36 | 2.408287e+06 |
PHYSICIAN | 3735124.94 | 3.594320e+06 |
PRESIDENT | 1878509.95 | 4.720924e+06 |
PROFESSOR | 2165071.08 | 2.967027e+05 |
REAL ESTATE | 528902.09 | 1.625902e+06 |
RETIRED | 25305116.38 | 2.356124e+07 |
SELF-EMPLOYED | 672393.40 | 1.640253e+06 |
over_2mm.plot(kind='barh')
def get_top_amounts(group, key, n=5):
    # total contribution amount per value of `key` within the group,
    # keeping only the n largest totals
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.nlargest(n)
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
cand_nm        contbr_occupation
Obama, Barack  RETIRED                                   25305116.38
               ATTORNEY                                  11141982.97
               INFORMATION REQUESTED                      4866973.96
               HOMEMAKER                                  4248875.80
               PHYSICIAN                                  3735124.94
               LAWYER                                     3160478.87
               CONSULTANT                                 2459912.71
Romney, Mitt   RETIRED                                   11508473.59
               INFORMATION REQUESTED PER BEST EFFORTS    11396894.84
               HOMEMAKER                                  8147446.22
               ATTORNEY                                   5364718.82
               PRESIDENT                                  2491244.89
               EXECUTIVE                                  2300947.03
               C.E.O.                                     1968386.11
Name: contb_receipt_amt, dtype: float64
bins = np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped = fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 493 | 77 |
(1, 10] | 40070 | 3681 |
(10, 100] | 372280 | 31853 |
(100, 1000] | 153991 | 43357 |
(1000, 10000] | 22284 | 26186 |
(10000, 100000] | 2 | 1 |
(100000, 1000000] | 3 | 0 |
(1000000, 10000000] | 4 | 0 |
bucket_sum = grouped.contb_receipt_amt.sum().unstack(0)
normed_sums = bucket_sum.div(bucket_sum.sum(axis=1),axis=0)
normed_sums
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 0.805182 | 0.194818 |
(1, 10] | 0.918767 | 0.081233 |
(10, 100] | 0.910769 | 0.089231 |
(100, 1000] | 0.710176 | 0.289824 |
(1000, 10000] | 0.447326 | 0.552674 |
(10000, 100000] | 0.823120 | 0.176880 |
(100000, 1000000] | 1.000000 | 0.000000 |
(1000000, 10000000] | 1.000000 | 0.000000 |
normed_sums[:-2].plot(kind='barh')
#aggregating the data by candidate and state is a routine analysis
grouped = fec_mrbo.groupby(['cand_nm','contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals[totals.sum(1)>100000]
totals[:10]
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AA | 56405.00 | 135.00 |
AB | 2048.00 | 0.00 |
AE | 42973.75 | 5680.00 |
AK | 281840.15 | 86204.24 |
AL | 543123.48 | 527303.51 |
AP | 37130.50 | 1655.00 |
AR | 359247.28 | 105556.00 |
AS | 2955.00 | 0.00 |
AZ | 1506476.98 | 1888436.23 |
CA | 23824984.24 | 11237636.60 |
percent = totals.div(totals.sum(1),axis=0)
percent[:10]
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AA | 0.997612 | 0.002388 |
AB | 1.000000 | 0.000000 |
AE | 0.883257 | 0.116743 |
AK | 0.765778 | 0.234222 |
AL | 0.507390 | 0.492610 |
AP | 0.957329 | 0.042671 |
AR | 0.772902 | 0.227098 |
AS | 1.000000 | 0.000000 |
AZ | 0.443745 | 0.556255 |
CA | 0.679498 | 0.320502 |
np.ones((10,5)).shape
(10, 5)
#a typical (C order) 3 × 4 × 5 array of float64 (8-byte) values has strides (160, 40, 8)
np.ones((3,4,5),dtype=np.float64).strides
(160, 40, 8)
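#(added note) strides also explain why transposing is essentially free: it only permutes
#the stride tuple over the same buffer, without copying any data
np.ones((3,4,5),dtype=np.float64).transpose((2,1,0)).strides
(8, 40, 160)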
ints = np.ones(10,dtype=np.uint16)
ints
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=uint16)
floats = np.ones(10,dtype=np.float32)
floats
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)
np.issubdtype(ints.dtype,np.integer)
True
np.issubdtype(floats.dtype,np.floating)
True
#you can see all of the parent classes of a particular dtype by calling the type's mro method
np.float64.mro()
[numpy.float64,
numpy.floating,
numpy.inexact,
numpy.number,
numpy.generic,
float,
object]
np.issubdtype(ints.dtype,np.number)
True
arr = np.arange(8)
arr
array([0, 1, 2, 3, 4, 5, 6, 7])
#in many cases, you can convert an array from one shape to another without copying any data
arr.reshape(4,2)
array([[0, 1],
[2, 3],
[4, 5],
[6, 7]])
#multidimensional arrays can also be reshaped
arr.reshape(4,2).reshape(2,4)
array([[0, 1, 2, 3],
[4, 5, 6, 7]])
#one of the passed shape dimensions can be -1, meaning that dimension is inferred from the data
arr = np.arange(15)
arr.reshape((5,-1))
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
#since an array's shape attribute is a tuple, it too can be passed to reshape
other_arr = np.ones((3,5))
other_arr.shape
(3, 5)
arr.reshape(other_arr.shape)
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
#the inverse operation of reshape, from higher dimensions down to one dimension,
#is usually called flattening or raveling
arr = np.arange(15).reshape((5,3))
arr
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
arr.ravel()
#ravel does not produce a copy of the underlying values if the values in the result were contiguous in the original array
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
arr.flatten()
#the flatten method behaves like ravel, but it always returns a copy of the data
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
arr = np.arange(12).reshape((3,4))
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
arr.ravel()
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
arr.ravel('F')
array([ 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11])
arr1 = np.array([[1,2,3],[4,5,6]])
arr1
array([[1, 2, 3],
[4, 5, 6]])
arr2 = np.array([[7,8,9],[10,11,12]])
arr2
array([[ 7, 8, 9],
[10, 11, 12]])
np.concatenate([arr1,arr2],axis=0)
array([[ 1, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12]])
np.concatenate([arr1,arr2],axis=1)
array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
np.vstack((arr1,arr2))
array([[ 1, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12]])
np.hstack((arr1,arr2))
array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
arr = np.random.randn(5,2)
arr
array([[-0.37933271, -1.04852791],
[-0.3278915 , 1.11594819],
[ 0.77077511, -1.19903381],
[ 0.38477425, -0.35244269],
[ 1.38135852, -0.10439573]])
#split slices an array into multiple arrays along an axis
#the values [2,3] passed to np.split are the indices at which to split the array
first,second,third = np.split(arr,[2,3])
first
array([[-0.37933271, -1.04852791],
[-0.3278915 , 1.11594819]])
second
array([[ 0.77077511, -1.19903381]])
third
array([[ 0.38477425, -0.35244269],
[ 1.38135852, -0.10439573]])
Function | Description |
---|---|
concatenate | most general function, concatenates a collection of arrays along one axis |
vstack, row_stack | stack arrays row-wise (along axis 0) |
hstack | stack arrays column-wise (along axis 1) |
column_stack | like hstack, but converts 1D arrays to 2D column vectors first |
dstack | stack arrays "depth"-wise (along axis 2) |
split | split an array at the passed locations along a given axis |
hsplit/vsplit | convenience functions for splitting along axis 1 and axis 0, respectively |
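#(added examples) quick sketches of the helpers from the table that are not demonstrated above
a = np.array([1,2,3])
b = np.array([4,5,6])
np.column_stack((a,b))        # 1D inputs become the columns of a 2D array
np.dstack((a,b)).shape        # stacking along a third "depth" axis -> (1, 3, 2)
m = np.arange(16).reshape((4,4))
top, bottom = np.vsplit(m,2)  # split along axis 0 (rows)
left, right = np.hsplit(m,2)  # split along axis 1 (columns)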
arr = np.arange(6)
arr
array([0, 1, 2, 3, 4, 5])
arr1= arr.reshape((3,2))
arr1
array([[0, 1],
[2, 3],
[4, 5]])
arr2 = np.random.randn(3,2)
arr2
array([[-2.17693174, 1.20516725],
[-0.44083574, 0.84645799],
[ 0.02369097, 0.63556261]])
np.r_[arr1,arr2]
array([[ 0. , 1. ],
[ 2. , 3. ],
[ 4. , 5. ],
[-2.17693174, 1.20516725],
[-0.44083574, 0.84645799],
[ 0.02369097, 0.63556261]])
np.c_[arr1,arr2]
array([[ 0. , 1. , -2.17693174, 1.20516725],
[ 2. , 3. , -0.44083574, 0.84645799],
[ 4. , 5. , 0.02369097, 0.63556261]])
np.c_[np.r_[arr1,arr2],arr]
array([[ 0. , 1. , 0. ],
[ 2. , 3. , 1. ],
[ 4. , 5. , 2. ],
[-2.17693174, 1.20516725, 3. ],
[-0.44083574, 0.84645799, 4. ],
[ 0.02369097, 0.63556261, 5. ]])
#c_ can also translate slices into arrays
np.c_[1:6,-10:-5]
array([[ 1, -10],
[ 2, -9],
[ 3, -8],
[ 4, -7],
[ 5, -6]])
arr = np.arange(3)
arr
array([0, 1, 2])
arr.repeat(3)
array([0, 0, 0, 1, 1, 1, 2, 2, 2])
arr.repeat([2,3,4])
array([0, 0, 1, 1, 1, 2, 2, 2, 2])
#the elements of a multidimensional array can be repeated along a particular axis
arr = np.random.randn(2,2)
arr
array([[-0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328]])
arr.repeat(2,axis=0)
array([[-0.86642515, -0.21137086],
[-0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328],
[ 0.4945539 , -0.02745328]])
#note that if no axis is passed, the array will be flattened first, which is probably not what you want
arr.repeat(2)
array([-0.86642515, -0.86642515, -0.21137086, -0.21137086, 0.4945539 ,
0.4945539 , -0.02745328, -0.02745328])
#to repeat slices of a multidimensional array a different number of times, you can pass an array of integers
arr.repeat([2,3],axis=0)
array([[-0.86642515, -0.21137086],
[-0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328],
[ 0.4945539 , -0.02745328],
[ 0.4945539 , -0.02745328]])
arr.repeat([2,3],axis=1)
array([[-0.86642515, -0.86642515, -0.21137086, -0.21137086, -0.21137086],
[ 0.4945539 , 0.4945539 , -0.02745328, -0.02745328, -0.02745328]])
#tile is a shortcut for stacking copies of an array along an axis; visually you can think of it as laying down tiles
arr
array([[-0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328]])
np.tile(arr,2)
array([[-0.86642515, -0.21137086, -0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328, 0.4945539 , -0.02745328]])
np.tile(arr,(2,1))
array([[-0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328],
[-0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328]])
#the second argument to tile can be a tuple describing the layout of the tiling
np.tile(arr,(2,2))
array([[-0.86642515, -0.21137086, -0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328, 0.4945539 , -0.02745328],
[-0.86642515, -0.21137086, -0.86642515, -0.21137086],
[ 0.4945539 , -0.02745328, 0.4945539 , -0.02745328]])
arr = np.arange(10)*100
arr
array([ 0, 100, 200, 300, 400, 500, 600, 700, 800, 900])
inds = [7,1,2,6]
arr[inds]
array([700, 100, 200, 600])
arr.take(inds)
array([700, 100, 200, 600])
arr.put(inds,42)
arr
array([ 0, 42, 42, 300, 400, 500, 42, 42, 800, 900])
arr.put(inds,[40,41,42,43])
arr
array([ 0, 41, 42, 300, 400, 500, 43, 40, 800, 900])
#to take along another axis, pass the axis keyword
inds = [2,0,2,1]
arr = np.random.randn(2,4)
arr
array([[ 0.42067458, 1.11465134, 0.80097006, -0.37064359],
[-0.57974434, 1.24554556, 0.25903436, -0.10895085]])
arr.take(inds,axis=1)
array([[ 0.80097006, 0.42067458, 0.80097006, 1.11465134],
[ 0.25903436, -0.57974434, 0.25903436, 1.24554556]])
arr = np.arange(5)
arr
array([0, 1, 2, 3, 4])
#here we say that the scalar value 4 has been broadcast to all of the other elements in the multiplication
arr*4
array([ 0, 4, 8, 12, 16])
arr = np.random.randn(4,3)
arr
array([[-0.26130828, 0.21031853, 0.09806178],
[-1.89409267, -0.30607457, 1.14174612],
[-0.04140891, -1.4256403 , 0.17503634],
[ 0.94815936, -0.47780023, -0.17362592]])
arr.mean(0)
array([-0.31216263, -0.49979914, 0.31030458])
demeaned = arr - arr.mean(0)
demeaned
array([[ 0.05085435, 0.71011767, -0.2122428 ],
[-1.58193004, 0.19372457, 0.83144154],
[ 0.27075371, -0.92584116, -0.13526824],
[ 1.26032198, 0.02199892, -0.4839305 ]])
demeaned.mean(0)
array([5.55111512e-17, 1.38777878e-17, 1.38777878e-17])
arr
array([[-0.26130828, 0.21031853, 0.09806178],
[-1.89409267, -0.30607457, 1.14174612],
[-0.04140891, -1.4256403 , 0.17503634],
[ 0.94815936, -0.47780023, -0.17362592]])
row_means = arr.mean(1)
row_means
array([ 0.01569068, -0.35280704, -0.43067096, 0.09891107])
row_means.shape
(4,)
row_means.reshape((4,1))
array([[ 0.01569068],
[-0.35280704],
[-0.43067096],
[ 0.09891107]])
demeaned = arr - row_means.reshape((4,1))
demeaned.mean(1)
array([4.62592927e-18, 7.40148683e-17, 7.40148683e-17, 0.00000000e+00])
arr - arr.mean(1)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-111-8b8ada26fac0> in <module>
----> 1 arr - arr.mean(1)
ValueError: operands could not be broadcast together with shapes (4,3) (4,)
arr - arr.mean(1).reshape((4,1))
array([[-0.27699896, 0.19462785, 0.0823711 ],
[-1.54128563, 0.04673247, 1.49455316],
[ 0.38926205, -0.99496934, 0.6057073 ],
[ 0.84924828, -0.5767113 , -0.27253699]])
#insert a new axis using the special np.newaxis attribute together with "full" slices
arr = np.zeros((4,4))
arr
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
arr_3d = arr[:,np.newaxis,:]
arr_3d
array([[[0., 0., 0., 0.]],
[[0., 0., 0., 0.]],
[[0., 0., 0., 0.]],
[[0., 0., 0., 0.]]])
arr_3d.shape
(4, 1, 4)
arr_1d = np.random.normal(size=3)
arr_1d[:,np.newaxis]
array([[-0.44142019],
[ 0.19138049],
[ 1.70465573]])
arr_1d[np.newaxis,:]
array([[-0.44142019, 0.19138049, 1.70465573]])
arr = np.random.randn(3,4,5)
arr
array([[[ 0.10223077, -1.53873895, -0.99946213,  0.71598751, -0.90498114],
        [-0.01548156,  0.30273138,  0.34831772,  1.64086735,  0.52801345],
        [-1.31620627, -0.79570758, -1.34854625, -2.63311809, -1.11911915],
        [-0.80136175, -1.94967438, -0.28787123,  0.33664872,  0.16180744]],

       [[ 1.77507844, -0.6858868 , -0.53739313,  1.33779554,  1.53855697],
        [ 1.9271013 ,  0.58314326, -0.73893003,  0.67052899, -0.00530868],
        [-0.19838128, -0.92396483, -0.72747217,  0.8346707 ,  0.44643892],
        [-0.37615445,  1.8688799 , -0.55484319,  0.50585597, -0.26799842]],

       [[ 0.57238033, -0.17529308, -0.72637569, -2.89489543, -0.01108801],
        [-0.17406094, -0.79553743, -0.64445857, -1.0084828 ,  0.59183829],
        [-0.60375821,  0.15761849,  0.25371104, -0.60639911, -1.20483347],
        [ 0.70185761, -0.90187431,  0.45284624, -1.09157387,  0.70808834]]])
depth_means = arr.mean(2)
depth_means
array([[-0.52499279, 0.56088967, -1.44253947, -0.50809024],
[ 0.6856302 , 0.48730697, -0.11374173, 0.23514796],
[-0.64705438, -0.40614029, -0.40073225, -0.0261312 ]])
depth_means.shape
(3, 4)
demeaned = arr - depth_means[:,:,np.newaxis]
demeaned.mean(2)
array([[-2.22044605e-17, 4.44089210e-17, -2.22044605e-17,
-2.22044605e-17],
[-4.44089210e-17, -2.22044605e-17, 4.44089210e-17,
0.00000000e+00],
[-8.88178420e-17, -4.44089210e-17, 8.88178420e-17,
2.22044605e-17]])
def demean_axis(arr, axis=0):
    # generalize "[:, :, np.newaxis]"-style demeaning to an arbitrary axis:
    # build a full-slice indexer and put np.newaxis in the reduced axis
    means = arr.mean(axis)
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # newer NumPy requires a tuple here; indexing with a list of slices is no longer allowed
    return arr - means[tuple(indexer)]
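#(added usage check) a quick sanity check that the helper works on any axis
arr = np.random.randn(3,4,5)
np.allclose(demean_axis(arr,axis=2).mean(2), 0)   # -> True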
arr = np.zeros((4,3))
arr[:] = 5
arr
array([[5., 5., 5.],
[5., 5., 5.],
[5., 5., 5.],
[5., 5., 5.]])
col = np.array([1.28,-0.42,0.44,1.6])
arr[:] = col[:,np.newaxis]
arr
array([[ 1.28, 1.28, 1.28],
[-0.42, -0.42, -0.42],
[ 0.44, 0.44, 0.44],
[ 1.6 , 1.6 , 1.6 ]])
arr[:2] = [[-1.37],[0.509]]
arr
array([[-1.37 , -1.37 , -1.37 ],
[ 0.509, 0.509, 0.509],
[ 0.44 , 0.44 , 0.44 ],
[ 1.6 , 1.6 , 1.6 ]])
Method | Description |
---|---|
reduce(x) | aggregate values by successive applications of the operation |
accumulate(x) | aggregate values, preserving all partial aggregates |
reduceat(x, bins) | "local" reduce or "group by": reduces contiguous slices of the data to produce an aggregated array |
arr = np.arange(10)
arr
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
#the reduce method takes a single array and aggregates its values, optionally along an axis, by a sequence of binary operations
#the starting value (0 for add) depends on the ufunc; if an axis is passed, the reduction is performed along that axis
np.add.reduce(arr)
45
arr.sum()
45
#use np.logical_and to check whether the values in each row of the array are sorted
np.random.seed(12346)
arr = np.random.randn(5,5)
arr
array([[-8.99822478e-02, 7.59372617e-01, 7.48336101e-01,
-9.81497953e-01, 3.65775545e-01],
[-3.15442628e-01, -8.66135605e-01, 2.78568155e-02,
-4.55597723e-01, -1.60189223e+00],
[ 2.48256116e-01, -3.21536673e-01, -8.48730755e-01,
4.60468309e-04, -5.46459347e-01],
[ 2.53915229e-01, 1.93684246e+00, -7.99504902e-01,
-5.69159281e-01, 4.89244731e-02],
[-6.49092950e-01, -4.79535727e-01, -9.53521432e-01,
1.42253882e+00, 1.75403128e-01]])
#sort every second row (rows 0, 2, 4) along axis 1, in place
arr[::2].sort(1)
arr[:,:-1]
array([[-9.81497953e-01, -8.99822478e-02, 3.65775545e-01,
7.48336101e-01],
[-3.15442628e-01, -8.66135605e-01, 2.78568155e-02,
-4.55597723e-01],
[-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
4.60468309e-04],
[ 2.53915229e-01, 1.93684246e+00, -7.99504902e-01,
-5.69159281e-01],
[-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
1.75403128e-01]])
arr[:,1:]
array([[-8.99822478e-02, 3.65775545e-01, 7.48336101e-01,
7.59372617e-01],
[-8.66135605e-01, 2.78568155e-02, -4.55597723e-01,
-1.60189223e+00],
[-5.46459347e-01, -3.21536673e-01, 4.60468309e-04,
2.48256116e-01],
[ 1.93684246e+00, -7.99504902e-01, -5.69159281e-01,
4.89244731e-02],
[-6.49092950e-01, -4.79535727e-01, 1.75403128e-01,
1.42253882e+00]])
arr[:,:-1] < arr[:,1:]
array([[ True, True, True, True],
[False, True, False, False],
[ True, True, True, True],
[ True, False, True, True],
[ True, True, True, True]])
#note that logical_and.reduce is equivalent to the all method
np.logical_and.reduce(arr[:,:-1] < arr[:,1:],axis=1)
array([ True, False, True, False, True])
#accumulate is related to reduce in the same way that cumsum is related to sum
#accumulate produces an array of the same size holding the intermediate "accumulated" values
arr = np.arange(15).reshape((3,5))
arr
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
np.add.reduce(arr,axis=1)
array([10, 35, 60])
np.add.accumulate(arr,axis=1)
array([[ 0, 1, 3, 6, 10],
[ 5, 11, 18, 26, 35],
[10, 21, 33, 46, 60]], dtype=int32)
np.add.reduce(arr,axis=0)
array([15, 18, 21, 24, 27])
np.add.accumulate(arr,axis=0)
array([[ 0, 1, 2, 3, 4],
[ 5, 7, 9, 11, 13],
[15, 18, 21, 24, 27]], dtype=int32)
#outer performs a pairwise cross product between two arrays
arr = np.arange(3).repeat([1,2,2])
arr
array([0, 1, 1, 2, 2])
np.multiply.outer(arr,np.arange(5))
array([[0, 0, 0, 0, 0],
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[0, 2, 4, 6, 8],
[0, 2, 4, 6, 8]])
#the number of dimensions of outer's output is the sum of the dimensions of the inputs
x,y = np.random.randn(3,4),np.random.randn(5)
x
array([[-1.1049211 , 0.7239073 , -0.95465401, 0.24438966],
[-0.14528732, -0.12229477, 0.49165039, -1.55720967],
[ 0.11172771, -0.26132992, 0.27843076, -0.10798888]])
y
array([ 0.11090105, -0.37904993, 2.60555583, -1.02235214, 0.26172618])
result = np.subtract.outer(x,y)
result.shape
(3, 4, 5)
#reduceat takes a sequence of "bin edges" that indicate how to split and aggregate the values
#the result here is the reductions (sums) over arr[0:5], arr[5:8], and arr[8:]
arr = np.arange(10)
print(arr)
np.add.reduceat(arr,[0,5,8])
[0 1 2 3 4 5 6 7 8 9]
array([10, 18, 17], dtype=int32)
#you can also pass an axis argument
arr = np.multiply.outer(np.arange(4),np.arange(5))
arr
array([[ 0, 0, 0, 0, 0],
[ 0, 1, 2, 3, 4],
[ 0, 2, 4, 6, 8],
[ 0, 3, 6, 9, 12]])
np.add.reduceat(arr,[0,2,4],axis=1)
array([[ 0, 0, 0],
[ 1, 5, 4],
[ 2, 10, 8],
[ 3, 15, 12]], dtype=int32)
def add_elements(x,y):
return x+y
#frompyfunc(func, nin, nout, *[, identity])
#the first argument is the function, the second the number of inputs, the third the number of outputs
add_them = np.frompyfunc(add_elements,2,1)
add_them
<ufunc 'add_elements (vectorized)'>
add_them(np.arange(8),np.arange(8))
array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)
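#(added note) ufuncs created with frompyfunc always return object arrays;
#cast with astype if you need a numeric dtype
add_them(np.arange(8),np.arange(8)).astype(np.float64)
array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])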
#another function, numpy.vectorize, lets you specify output types, though it is somewhat less featureful
add_them = np.vectorize(add_elements,otypes=[np.float64])
add_them
<numpy.vectorize at 0x15fa8974a60>
add_them(np.arange(8),np.arange(8))
array([ 0., 2., 4., 6., 8., 10., 12., 14.])
arr = np.random.randn(10000)
%timeit add_them(arr,arr)
902 µs ± 6.35 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
#the built-in ufunc is much faster
%timeit np.add(arr,arr)
2.7 µs ± 2.16 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
dtype = [('x',np.float64),('y',np.int32)]
sarr = np.array([(1.5,6),(np.pi,-2)],dtype=dtype)
sarr
array([(1.5 , 6), (3.14159265, -2)],
dtype=[('x', '<f8'), ('y', '<i4')])
#a typical way to specify a structured dtype is as a list of (field_name, field_data_type) tuples
sarr[0]
(1.5, 6)
sarr[0]['y']
6
sarr['x']
array([1.5 , 3.14159265])
dtype = [('x',np.float64,3),('y',np.int32)]
arr = np.zeros(4,dtype=dtype)
arr
array([([0., 0., 0.], 0), ([0., 0., 0.], 0), ([0., 0., 0.], 0),
([0., 0., 0.], 0)], dtype=[('x', '<f8', (3,)), ('y', '<i4')])
#in this case, the x field refers to an array of length 3 within each record
arr[0]['x']
array([0., 0., 0.])
arr['x']
array([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
dtype = [('x',[('a','f8'),('b','f4')]),('y',np.int32)]
data = np.array([((1,2),5),((3,4),6)],dtype=dtype)
data['x']
array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])
data['y']
array([5, 6])
data['x']['a']
array([1., 3.])
arr = np.random.randn(6)
arr
array([ 0.51034093, -1.21799778, -0.27034648, -1.33534252, -0.78528729,
-1.10908521])
arr.sort()
arr
array([-1.33534252, -1.21799778, -1.10908521, -0.78528729, -0.27034648,
0.51034093])
#when sorting an array in place, remember that if the array is a view on a different ndarray, the original array will be modified
arr = np.random.randn(3,5)
arr
array([[-0.00369513, -0.15297778, -0.46090167, -0.42008296, -0.91017112],
[-1.05144731, 1.41433111, 0.22343751, 1.98200412, -0.11843381],
[-1.71099598, -0.77901664, 1.9175701 , -0.36801273, 0.35893302]])
#sort the values of the first column in place
arr[:,0].sort()
#only the ordering of the first column has changed
arr
array([[-1.71099598, -0.15297778, -0.46090167, -0.42008296, -0.91017112],
[-1.05144731, 1.41433111, 0.22343751, 1.98200412, -0.11843381],
[-0.00369513, -0.77901664, 1.9175701 , -0.36801273, 0.35893302]])
#numpy.sort produces a new, sorted copy of the array
arr = np.random.randn(5)
arr
array([ 0.83175214, 0.0981957 , -0.16337765, 1.57507692, 1.20540736])
np.sort(arr)
array([-0.16337765, 0.0981957 , 0.83175214, 1.20540736, 1.57507692])
#the array is unchanged after np.sort()
arr
array([ 0.83175214, 0.0981957 , -0.16337765, 1.57507692, 1.20540736])
#all of these sort methods take an axis argument for independently sorting the sections of data along the passed axis
arr = np.random.randn(3,5)
arr
array([[ 0.48623846, 1.40501429, 0.21771959, -0.6147521 , -1.03729051],
[ 0.00466416, 1.31854631, -0.09256828, -1.03503114, 0.70669487],
[-0.06967569, -0.55095404, 0.87325007, -1.9579896 , -0.10276109]])
#sort across each row (axis=1); this modifies arr in place
arr.sort(axis=1)
arr
array([[-1.03729051, -0.6147521 , 0.21771959, 0.48623846, 1.40501429],
[-1.03503114, -0.09256828, 0.00466416, 0.70669487, 1.31854631],
[-1.9579896 , -0.55095404, -0.10276109, -0.06967569, 0.87325007]])
arr.sort(axis=0)
#sort down each column (axis=0); this also modifies arr in place
arr
array([[-1.9579896 , -0.6147521 , -0.10276109, -0.06967569, 0.87325007],
[-1.03729051, -0.55095404, 0.00466416, 0.48623846, 1.31854631],
[-1.03503114, -0.09256828, 0.21771959, 0.70669487, 1.40501429]])
arr[:,::-1]
array([[ 0.87325007, -0.06967569, -0.10276109, -0.6147521 , -1.9579896 ],
[ 1.31854631, 0.48623846, 0.00466416, -0.55095404, -1.03729051],
[ 1.40501429, 0.70669487, 0.21771959, -0.09256828, -1.03503114]])
values = np.array([5,0,1,3,2])
indexer = values.argsort()
indexer
array([1, 2, 4, 3, 0], dtype=int64)
values[indexer]
array([0, 1, 2, 3, 5])
#reorder the columns of a two-dimensional array according to its first row
arr = np.random.randn(3,5)
arr[0] = values
arr
array([[ 5. , 0. , 1. , 3. , 2. ],
[ 1.01782863, -1.18082614, 0.66861266, -1.51142124, -0.91934196],
[ 1.16468714, 0.12410901, 1.69151564, 0.8931546 , 0.16763928]])
arr[:,arr[0].argsort()]
array([[ 0. , 1. , 2. , 3. , 5. ],
[-1.18082614, 0.66861266, -0.91934196, -1.51142124, 1.01782863],
[ 0.12410901, 1.69151564, 0.16763928, 0.8931546 , 1.16468714]])
#lexsort is like argsort, but it performs an indirect lexicographical sort on multiple key arrays; the last key passed is used as the primary sort key
first_name = np.array(['Bob','Jane','Steve','Bill','Barbara'])
last_name = np.array(['Jone','Arnold','Arnold','Jone','Walters'])
sorter = np.lexsort((first_name,last_name))
sorter
array([1, 2, 3, 0, 4], dtype=int64)
first_name[sorter]
array(['Jane', 'Steve', 'Bill', 'Bob', 'Barbara'], dtype='<U7')
last_name[sorter]
array(['Arnold', 'Arnold', 'Jone', 'Jone', 'Walters'], dtype='<U7')
#zip returns a lazy iterator in Python 3; materialize it with list() to see the pairs
list(zip(first_name[sorter],last_name[sorter]))
[('Jane', 'Arnold'), ('Steve', 'Arnold'), ('Bill', 'Jone'), ('Bob', 'Jone'), ('Barbara', 'Walters')]
Kind | Speed | Stable | Work space | Worst case |
---|---|---|---|---|
quicksort | 1 | No | 0 | O(n^2) |
mergesort | 2 | Yes | n/2 | O(n log n) |
heapsort | 3 | No | 0 | O(n log n) |
values = np.array(['2:first','2:second','1:first','1:second','1:third'])
key = np.array([2,2,1,1,1])
indexer = key.argsort(kind='mergesort')
indexer
array([2, 3, 4, 0, 1], dtype=int64)
values.take(indexer)
array(['1:first', '1:second', '1:third', '2:first', '2:second'],
dtype='<U8')
np.random.seed(12345)
arr = np.random.randn(20)
arr
array([-0.20470766, 0.47894334, -0.51943872, -0.5557303 , 1.96578057,
1.39340583, 0.09290788, 0.28174615, 0.76902257, 1.24643474,
1.00718936, -1.29622111, 0.27499163, 0.22891288, 1.35291684,
0.88642934, -2.00163731, -0.37184254, 1.66902531, -0.43856974])
#after calling partition(arr,3), the first three elements in the result are the three smallest values, in no particular order
np.partition(arr,3)
array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
-0.43856974, -0.20470766, 0.28174615, 0.76902257, 0.47894334,
1.00718936, 0.09290788, 0.27499163, 0.22891288, 1.35291684,
0.88642934, 1.39340583, 1.96578057, 1.66902531, 1.24643474])
#numpy.argpartition, like numpy.argsort, returns the indices that would rearrange the data into the equivalent order
indices = np.argpartition(arr,3)
indices
array([16, 11, 3, 2, 17, 19, 0, 7, 8, 1, 10, 6, 12, 13, 14, 15, 5,
4, 18, 9], dtype=int64)
arr.take(indices)
array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
-0.43856974, -0.20470766, 0.28174615, 0.76902257, 0.47894334,
1.00718936, 0.09290788, 0.27499163, 0.22891288, 1.35291684,
0.88642934, 1.39340583, 1.96578057, 1.66902531, 1.24643474])
arr = np.array([0,1,7,12,15])
arr.searchsorted(9)
3
#you can also pass an array of values to get back an array of indices
#for the element 0, searchsorted returns 0 because the default behavior is to return the index at the left side of a group of equal values
arr.searchsorted([0,8,11,16])
array([0, 3, 3, 5], dtype=int64)
arr = np.array([0,0,0,1,1,1,1])
arr.searchsorted([0,1])
array([0, 3], dtype=int64)
arr.searchsorted([0,1],side='right')
array([3, 7], dtype=int64)
#as another application of searchsorted, suppose we had an array of values between 0 and 10,000,
#along with a separate array of "bucket edges" that we want to use to bin the data
data = np.floor(np.random.uniform(0,10000,size=50))
data
array([9940., 6768., 7908., 1709., 268., 8003., 9037., 246., 4917.,
5262., 5963., 519., 8950., 7282., 8183., 5002., 8101., 959.,
2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
4954., 3516., 7142., 5039., 2256.])
bins = np.array([0,100,1000,5000,100000])
labels = bins.searchsorted(data)
labels
array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
4, 3, 3, 4, 4, 3], dtype=int64)
#this can be combined with pandas's groupby to bin the data
pd.Series(data).groupby(labels).count()
2 4
3 18
4 28
dtype: int64
#this function computes the value of the expression (x - y).mean() using a for loop
import numpy as np
def mean_distance(x,y):
nx = len(x)
result = 0.0
count = 0
for i in range(nx):
result+= x[i] - y[i]
count+=1
return result/count
x = np.random.randn(1000000)
y = np.random.randn(1000000)
%timeit mean_distance(x,y)
232 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit (x-y).mean()
1.51 ms ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
#the numba.jit function can compile this function into compiled Numba code
import numba as nb
numba_mean_distance = nb.jit(mean_distance)
#it can also be written as a decorator
@nb.jit
def mean_distance(x,y):
nx = len(x)
result = 0.0
count = 0
for i in range(nx):
result+= x[i] - y[i]
count+=1
return result/count
%timeit numba_mean_distance(x,y)
670 µs ± 48.8 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
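#(added caveat) the first call to a jitted function includes compilation time, so it pays to
#call it once to "warm up" before timing; whether the numbers above already exclude compilation
#is an assumption
numba_mean_distance(x,y)            # first call compiles, then runs
%timeit numba_mean_distance(x,y)    # subsequent calls reuse the compiled code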
from numba import float64,njit
@njit(float64(float64[:],float64[:]))
def mean_distance(x,y):
return (x-y).mean()
from numba import vectorize
@vectorize
def nb_add(x,y):
return x+y
x = np.arange(10)
nb_add(x,x)
array([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18], dtype=int64)
mmap = np.memmap('mymmap',dtype='float64',mode='w+',shape=(10000,10000))
mmap
memmap([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
#slicing a memmap returns a view onto the data on disk
section = mmap[:5]
section
memmap([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
#if you assign data to these slices, it is buffered in memory (like a Python file object), but you can call flush to write it to disk
section[:] = np.random.randn(5,10000)
mmap.flush()
mmap
memmap([[ 0.41110843, 0.58204806, 1.2463012 , ..., 0.06582078,
-0.34734378, 0.62280733],
[-2.21583571, 0.29678775, 0.57086919, ..., 0.07007184,
-0.26204433, -0.30061136],
[ 0.77817885, 0.74008809, 0.49653126, ..., -0.51072764,
1.11806763, 0.09285284],
...,
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ]])
del mmap
mmap = np.memmap('mymmap',dtype='float64',shape=(10000,10000))
mmap
memmap([[ 0.41110843, 0.58204806, 1.2463012 , ..., 0.06582078,
-0.34734378, 0.62280733],
[-2.21583571, 0.29678775, 0.57086919, ..., 0.07007184,
-0.26204433, -0.30061136],
[ 0.77817885, 0.74008809, 0.49653126, ..., -0.51072764,
1.11806763, 0.09285284],
...,
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ]])
arr_c = np.ones((1000,1000),order='c')
arr_f = np.ones((1000,1000),order='F')
arr_c.flags
C_CONTIGUOUS : True
F_CONTIGUOUS : False
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
UPDATEIFCOPY : False
arr_f.flags
C_CONTIGUOUS : False
F_CONTIGUOUS : True
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
UPDATEIFCOPY : False
%timeit arr_c.sum(1)
241 µs ± 1.42 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit arr_f.sum(1)
238 µs ± 589 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
arr_f.copy('C').flags
C_CONTIGUOUS : True
F_CONTIGUOUS : False
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
UPDATEIFCOPY : False
#when constructing a view on an array, keep in mind that the result is not guaranteed to be contiguous
arr_c[:50].flags.contiguous
True
arr_c[:,:50].flags
C_CONTIGUOUS : False
F_CONTIGUOUS : False
OWNDATA : False
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
UPDATEIFCOPY : False
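#(added note) if contiguity matters for a later step, a non-contiguous view can be
#copied back into C order with ascontiguousarray
np.ascontiguousarray(arr_c[:,:50]).flags['C_CONTIGUOUS']
True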
2**27
134217728
_
134217728
foo = 'bar'
foo
'bar'
_i137
"foo = 'bar'\nfoo"
_137
'bar'
#since the input variables are strings, they can be executed again using Python's exec keyword
exec(_i27)
%hist
Command | Description |
---|---|
!cmd | execute cmd in the system shell |
output = !cmd args | run cmd and store its stdout in output |
%alias alias_name cmd | define an alias for a system (shell) command |
%bookmark | use IPython's directory bookmarking system |
%cd directory | change the system working directory to the passed directory |
%pwd | return the current system working directory |
%pushd directory | place the current directory on a stack and change to the target directory |
%popd | change to the directory popped off the top of the stack |
%dirs | return a list containing the current directory stack |
%dhist | print the history of visited directories |
%env | return the system environment variables as a dict |
%matplotlib | configure matplotlib integration options |
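#(added sketch, run inside IPython; the directory names are made up for illustration)
#`!` captures a shell command's stdout as a list of strings
files = !ls datasets            # on Windows: !dir /b datasets
len(files)
#bookmark a frequently used directory and jump to it later
%bookmark data datasets/fec
%cd -b data
%pwd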
Command | Action |
---|---|
h(elp) | display the command list |
help command | show documentation for command |
c(ontinue) | resume program execution |
q(uit) | exit the debugger without executing any more code |
b(reak) number | set a breakpoint at line number in the current file |
b path/to/file.py:number | set a breakpoint at line number in the specified file |
s(tep) | step into a function call |
n(ext) | execute the current line and advance to the next line at the current level |
u(p)/d(own) | move up/down in the function call stack |
a(rgs) | show the arguments of the current function |
debug statement | invoke statement in a new (recursive) debugger |
l(ist) statement | show the current position and context at the current level of the stack |
w(here) | print a full stack traceback with context at the current position |
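#(added sketch, not from the original notes) a typical debugging workflow:
#after an exception, %debug opens the debugger at the point of failure,
#where the commands above (u/d, args, w, ...) can be used
%debug
#alternatively, set an explicit breakpoint inside your own code
from IPython.core.debugger import set_trace
def f(x):
    set_trace()   # execution stops here; use n/s/c to step through or continue
    return x + 1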