# plotnine: ggplot2 in Python

import plotnine as pn
from plotnine import data
import numpy as np
import pandas as pd
# statistical analysis
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols, glm, poisson
import copy
# suppress pandas warnings (e.g. hints that a DataFrame operation returns a view or a copy)
import warnings
warnings.filterwarnings("ignore")
# in Jupyter, auto-display every expression result in a cell instead of printing each one manually
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Take a subset df of mtcars: 32 records, 6 variables.
df = data.mtcars[["wt", "mpg", "cyl", "vs", "am", "gear"]]
df.shape
df.dtypes
print(df.head())
(32, 6)

wt      float64
mpg     float64
cyl       int64
vs        int64
am        int64
gear      int64
dtype: object

      wt   mpg  cyl  vs  am  gear
0  2.620  21.0    6   0   1     4
1  2.875  21.0    6   0   1     4
2  2.320  22.8    4   1   1     4
3  3.215  21.4    6   1   0     3
4  3.440  18.7    8   0   0     3
Convert vs, am, and gear from numeric variables to string-typed categorical variables.
df["vs"] = df["vs"].astype(str)
df["am"] = df["am"].astype(str)
df["gear"] = df["gear"].astype(str)
df.dtypes
wt float64
mpg float64
cyl int64
vs object
am object
gear object
dtype: object
Variable distributions
# common summary statistics for the continuous variables
print(df.describe())
# levels and frequencies of the categorical variables
df["vs"].value_counts()
df["am"].value_counts()
df["gear"].value_counts()
              wt        mpg        cyl
count  32.000000  32.000000  32.000000
mean    3.217250  20.090625   6.187500
std     0.978457   6.026948   1.785922
min     1.513000  10.400000   4.000000
25%     2.581250  15.425000   4.000000
50%     3.325000  19.200000   6.000000
75%     3.610000  22.800000   8.000000
max     5.424000  33.900000   8.000000

0    18
1    14
Name: vs, dtype: int64

0    19
1    13
Name: am, dtype: int64

3    15
4    12
5     5
Name: gear, dtype: int64
plotnine is a Python plotting library that mimics the syntax and visual style of ggplot2; anyone familiar with R's ggplot2 can pick it up quickly.
Official documentation: https://plotnine.readthedocs.io/
The library differs from R's ggplot2 in two main ways:
- In ggplot()'s mapping argument, variable names must be quoted: R writes aes(x = mpg, y = wt), while plotnine writes aes(x="mpg", y="wt").
- A plot built across multiple lines must either join the lines with a trailing \ or wrap the whole statement in parentheses.

Here are a few simple examples; an equivalent parenthesized form is shown after the first one.
pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt")) + \
pn.geom_point() + \
pn.geom_smooth(method="lm") + \
pn.theme_classic()
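The same plot with the statement wrapped in parentheses instead of backslash continuations (the two forms are equivalent):

(
    pn.ggplot(data=df, mapping=pn.aes(x="mpg", y="wt"))
    + pn.geom_point()
    + pn.geom_smooth(method="lm")
    + pn.theme_classic()
)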
Grouped by vs: a scatter plot plus a regression line with its 95% confidence interval for each group.
pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt", color="vs")) + \
pn.geom_point() + \
pn.geom_smooth(method="lm") + \
pn.theme_classic()
Faceting by vs and gear.
pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt")) + \
pn.facet_grid("vs ~ gear") + \
pn.geom_point() + \
pn.geom_smooth(method="lm") + \
pn.theme_xkcd()
Missing values in each variable
df.apply(lambda x: sum(x.isnull())).sort_values()
wt 0
mpg 0
cyl 0
vs 0
am 0
gear 0
dtype: int64
Distributions of the continuous and categorical variables
print(df.describe())
# count: number of non-missing values of the categorical variable
# unique: number of distinct values
# top: the most frequent value
# freq: how many times the most frequent value occurs
print(df.describe(include="object"))
wt mpg cyl
count 32.000000 32.000000 32.000000
mean 3.217250 20.090625 6.187500
std 0.978457 6.026948 1.785922
min 1.513000 10.400000 4.000000
25% 2.581250 15.425000 4.000000
50% 3.325000 19.200000 6.000000
75% 3.610000 22.800000 8.000000
max 5.424000 33.900000 8.000000
vs am gear
count 32 32 32
unique 2 2 3
top 0 0 3
freq 18 19 15
# frequencies
df["vs"].value_counts()
# proportions
df["vs"].value_counts(normalize=True)
0 18
1 14
Name: vs, dtype: int64
0 0.5625
1 0.4375
Name: vs, dtype: float64
# variance (np.var defaults to ddof=0, the population formula; pass ddof=1 for the sample variance)
np.var(df["wt"])
# standard deviation
np.std(df["wt"])
0.927460875
0.9630477013107918
# mode
stats.mode(df["wt"])
ModeResult(mode=array([3.44]), count=array([3]))
# skewness
stats.skew(df["wt"])
0.44378553550607736
# kurtosis (excess kurtosis; 0 for a normal distribution)
stats.kurtosis(df["wt"])
0.1724705401587343
Standard error of the mean for a normally distributed sample (the standard deviation of the sample mean), and its 95% confidence interval
# standard error (computed here from np.std with its default ddof=0)
se = np.std(df["wt"]) / np.sqrt(len(df["wt"]))
se
# lower and upper bounds of the 95% interval
np.mean(df["wt"]) - 1.96 * se
np.mean(df["wt"]) + 1.96 * se
0.17024439005074438
2.883570995500541
3.550929004499459
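The same interval can be obtained directly from scipy (a one-line sketch; stats.norm.interval takes the confidence level plus loc and scale):

# 95% CI for the mean under the normal approximation
stats.norm.interval(0.95, loc=np.mean(df["wt"]), scale=se)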
Shapiro-Wilk normality test: p = 0.09, so at the 0.05 significance level we fail to reject the null hypothesis; there is no evidence that wt deviates from a normal distribution.
stats.shapiro(df["wt"])
ShapiroResult(statistic=0.9432578682899475, pvalue=0.09265592694282532)
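As a complementary visual check on normality, statsmodels can draw a normal Q-Q plot (a sketch; points falling near the line suggest normality):

# normal Q-Q plot of wt against the standardized line
sm.qqplot(df["wt"], line="s")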
Bartlett's test: p = 0.38; no evidence of unequal variances.
stats.bartlett(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"])
BartlettResult(statistic=0.7611752294629192, pvalue=0.38296098768025166)
p = 0.001: the two population means differ significantly.
# Bartlett's test indicated equal variances, so equal_var=True; if variances are unequal, use Welch's corrected t-test (equal_var=False, shown below) or a rank-sum test
stats.ttest_ind(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"], equal_var=True)
Ttest_indResult(statistic=3.653533152238974, pvalue=0.0009798492309250216)
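Had Bartlett's test indicated unequal variances, the Welch-corrected version mentioned in the comment above differs only in equal_var:

# Welch's t-test: does not assume equal variances
stats.ttest_ind(df.loc[df["vs"] == "0", "wt"],
                df.loc[df["vs"] == "1", "wt"],
                equal_var=False)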
# split the 32 rows into two artificial "groups" of 16 to demonstrate the paired-samples t-test
df_copy = copy.deepcopy(df)
# note: .loc slicing is label-inclusive, so row 16 is first set to "0" and then overwritten with "1"
df_copy.loc[:16, "group"] = "0"
df_copy.loc[16:, "group"] = "1"
df_copy["group"].value_counts()
0 16
1 16
Name: group, dtype: int64
stats.ttest_rel(df_copy.loc[df_copy["group"] == "0", "wt"],
df_copy.loc[df_copy["group"] == "1", "wt"])
Ttest_relResult(statistic=2.0782957471723527, pvalue=0.05526298801644364)
Regarding independent versus paired samples, an example:
An RCT (randomized controlled trial) studies the antihypertensive effect of drug A versus drug B in the elderly. Participants are assigned to group A or group B, each treated for 14 days, with blood pressure measured at baseline (enrollment) and at day 14. Within group A (or B), to test whether blood pressure changed significantly after treatment, use a paired-samples t-test (equivalent to testing whether the mean of the before-after differences is significantly different from 0). To compare the effects of A and B, first compute each group's change from baseline, then run an independent-samples t-test on the two groups' change scores [alternatives include analysis of covariance with baseline blood pressure as a covariate]. A sketch of this design follows.
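A minimal sketch of that design with made-up numbers (rct, bp0, and bp14 are hypothetical names and values, purely for illustration):

# hypothetical trial data: treatment group, baseline BP (bp0), day-14 BP (bp14)
rct = pd.DataFrame({
    "group": ["A"] * 5 + ["B"] * 5,
    "bp0":   [150, 148, 155, 160, 152, 151, 149, 158, 162, 153],
    "bp14":  [138, 140, 142, 150, 139, 145, 143, 150, 155, 147],
})
# within group A: paired t-test, before vs. after
a = rct[rct["group"] == "A"]
stats.ttest_rel(a["bp0"], a["bp14"])
# between groups: independent t-test on the change scores
rct["change"] = rct["bp14"] - rct["bp0"]
stats.ttest_ind(rct.loc[rct["group"] == "A", "change"],
                rct.loc[rct["group"] == "B", "change"])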
Nonparametric alternatives: the Wilcoxon rank-sum test for two independent samples, the Wilcoxon signed-rank test for paired samples.
stats.ranksums(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"])
RanksumsResult(statistic=3.2668698585096214, pvalue=0.0010874365767340446)
stats.wilcoxon(df_copy.loc[df_copy["group"] == "0", "wt"],
df_copy.loc[df_copy["group"] == "1", "wt"])
WilcoxonResult(statistic=27.0, pvalue=0.033538818359375)
Fit the regression model first, then run an ANOVA on the fitted model to obtain the ANOVA table.
p = 0.0003: the three gear-group means differ significantly.
print(sm.stats.anova_lm(ols("wt ~ C(gear)", data = df).fit()))
df sum_sq mean_sq F PR(>F)
C(gear) 2.0 12.878947 6.439473 11.115889 0.000261
Residual 29.0 16.799801 0.579303 NaN NaN
Follow up with multiple comparisons (Tukey HSD): significant differences are found between gears 3 and 4 and between gears 3 and 5.
mc = sm.stats.multicomp.MultiComparison(df["wt"], df["gear"])
print(mc.tukeyhsd())
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
3 4 -1.2759 0.001 -2.0038 -0.5481 True
3 5 -1.26 0.0089 -2.2305 -0.2895 True
4 5 0.0159 0.9 -0.9844 1.0163 False
----------------------------------------------------
Choosing the method (n = total sample size, E = expected count per cell):
- n ≥ 40 and all E ≥ 5: Pearson chi-square test
- n ≥ 40 but some cell has 1 ≤ E < 5: continuity-corrected chi-square test
- n < 40, or any E < 1: Fisher's exact test
The expected counts are checked below.
print(pd.crosstab(df["vs"], df["am"]))
am 0 1
vs
0 12 6
1 7 7
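To apply the rules above, the expected count E for each cell can be inspected with scipy's expected_freq helper (a sketch; the same array is also the fourth element returned by chi2_contingency):

from scipy.stats.contingency import expected_freq
# expected counts under independence; all cells exceed 5 here, but n = 32 < 40
expected_freq(pd.crosstab(df["vs"], df["am"]))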
Since n = 32 < 40, Fisher's exact test should be used directly; the continuity-corrected chi-square test (correction=True) is run here only to demonstrate its usage. chi2_contingency returns (chi-square statistic, p-value, degrees of freedom, expected frequencies).
stats.chi2_contingency(pd.crosstab(df["vs"], df["am"]), correction = True)
(0.34753550543024225,
0.5555115470131495,
1,
array([[10.6875, 7.3125],
[ 8.3125, 5.6875]]))
Fisher's exact test: p = 0.47; no significant association between the two variables was found.
stats.fisher_exact(pd.crosstab(df["vs"], df["am"]))
(2.0, 0.4726974416017807)
Correlation between wt and mpg: Pearson (linear) and Spearman (rank-based) coefficients with their p-values.
stats.pearsonr(df["wt"], df["mpg"])
(-0.8676593765172278, 1.2939587013505124e-10)
stats.spearmanr(df["wt"], df["mpg"])
SpearmanrResult(correlation=-0.8864220332702978, pvalue=1.487594858127497e-11)
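pandas can also produce a full pairwise correlation matrix (a sketch; method accepts "pearson", "spearman", or "kendall"):

# correlation matrix of the continuous variables
df[["wt", "mpg", "cyl"]].corr(method="pearson")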
lm.summary() returns a wealth of information; each variable's regression coefficient appears in the coef column. For example, mpg's coefficient is -0.1191 with p < 0.05, meaning that holding cyl constant, each 1-unit increase in mpg is associated with a 0.12-unit decrease in wt (95% CI, -0.18 to -0.06).
lm = ols("wt ~ mpg + cyl", df).fit()
print(lm.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                     wt   R-squared:                       0.760
Model:                            OLS   Adj. R-squared:                  0.743
Method:                 Least Squares   F-statistic:                     45.82
Date:                Sun, 19 Jul 2020   Prob (F-statistic):           1.05e-09
Time:                        17:06:33   Log-Likelihood:                -21.393
No. Observations:                  32   AIC:                             48.79
Df Residuals:                      29   BIC:                             53.18
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.0760      1.117      4.544      0.000       2.791       7.361
mpg           -0.1191      0.028     -4.216      0.000      -0.177      -0.061
cyl            0.0863      0.095      0.905      0.373      -0.109       0.281
==============================================================================
Omnibus:                        5.603   Durbin-Watson:                   0.987
Prob(Omnibus):                  0.061   Jarque-Bera (JB):                4.488
Skew:                           0.910   Prob(JB):                        0.106
Kurtosis:                       3.234   Cond. No.                         277.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Fitted values of wt from the regression:
lm.predict(df)
0     3.092788
1     3.092788
2     2.705930
3     3.045155
4     3.539185
5     3.438123
6     4.063142
7     2.515401
8     2.705930
9     3.307134
10    3.473847
11    3.813072
12    3.705899
13    3.955969
14    4.527559
15    4.527559
16    4.015510
17    1.562751
18    1.800914
19    1.384130
20    2.860736
21    3.920245
22    3.955969
23    4.182224
24    3.479645
25    2.170065
26    2.324871
27    1.800914
28    3.884521
29    3.247593
30    3.979786
31    2.872644
dtype: float64
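Beyond point predictions, statsmodels can attach interval estimates (a sketch using get_prediction; the summary frame contains both confidence intervals for the mean and prediction intervals for new observations):

pred = lm.get_prediction(df)
print(pred.summary_frame(alpha=0.05).head())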
Categorical predictors are wrapped in C(); reference sets the reference level.
For example, the coefficient 0.0738 means that, holding mpg constant, mean wt in the vs = 1 group is 0.07 units higher than in the vs = 0 group, but the p-value is not significant (in general, check the p-value first; when it is far above the chosen significance level, as here with p = 0.76, there is little point in interpreting the coefficient).
lm = ols("wt ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                     wt   R-squared:                       0.754
Model:                            OLS   Adj. R-squared:                  0.737
Method:                 Least Squares   F-statistic:                     44.36
Date:                Sun, 19 Jul 2020   Prob (F-statistic):           1.51e-09
Time:                        17:06:33   Log-Likelihood:                -21.786
No. Observations:                  32   AIC:                             49.57
Df Residuals:                      29   BIC:                             53.97
Df Model:                           2
Covariance Type:            nonrobust
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                              6.0973      0.353     17.274      0.000       5.375       6.819
C(vs, Treatment(reference=0))[T.1]     0.0738      0.239      0.308      0.760      -0.416       0.563
mpg                                   -0.1450      0.020     -7.243      0.000      -0.186      -0.104
==============================================================================
Omnibus:                        5.895   Durbin-Watson:                   1.108
Prob(Omnibus):                  0.052   Jarque-Bera (JB):                4.733
Skew:                           0.932   Prob(JB):                       0.0938
Kurtosis:                       3.279   Cond. No.                         89.3
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Transforming the dependent variable: this is equivalent to first creating log_wt = log(wt) and then regressing log_wt on the other variables.
The mpg coefficient of -0.0495 can be interpreted as: each 1-unit increase in mpg changes wt by $[e^{-0.05} - 1] \times 100\% \approx -4.8\%$, i.e. a decrease of about 4.8%; the p-value is significant.
lm = ols("np.log(wt) ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:             np.log(wt)   R-squared:                       0.812
Model:                            OLS   Adj. R-squared:                  0.799
Method:                 Least Squares   F-statistic:                     62.66
Date:                Sun, 19 Jul 2020   Prob (F-statistic):           2.97e-11
Time:                        17:06:33   Log-Likelihood:                 18.563
No. Observations:                  32   AIC:                            -31.13
Df Residuals:                      29   BIC:                            -26.73
Df Model:                           2
Covariance Type:            nonrobust
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                              2.0995      0.100     20.988      0.000       1.895       2.304
C(vs, Treatment(reference=0))[T.1]     0.0371      0.068      0.547      0.588      -0.102       0.176
mpg                                   -0.0495      0.006     -8.724      0.000      -0.061      -0.038
==============================================================================
Omnibus:                        2.197   Durbin-Watson:                   1.607
Prob(Omnibus):                  0.333   Jarque-Bera (JB):                1.872
Skew:                           0.471   Prob(JB):                        0.392
Kurtosis:                       2.282   Cond. No.                         89.3
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Transforming an independent variable: this is equivalent to first creating mpg2 = mpg² and then using mpg2 in the regression.
The fitted equation is $\hat{wt} = 4.53 - 0.003 \times mpg^{2} - 0.10 \times vs$.
lm = ols("wt ~ np.square(mpg) + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                     wt   R-squared:                       0.680
Model:                            OLS   Adj. R-squared:                  0.658
Method:                 Least Squares   F-statistic:                     30.78
Date:                Sun, 19 Jul 2020   Prob (F-statistic):           6.75e-08
Time:                        17:06:33   Log-Likelihood:                -25.982
No. Observations:                  32   AIC:                             57.96
Df Residuals:                      29   BIC:                             62.36
Df Model:                           2
Covariance Type:            nonrobust
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                              4.5263      0.198     22.906      0.000       4.122       4.930
C(vs, Treatment(reference=0))[T.1]    -0.0965      0.265     -0.364      0.718      -0.638       0.445
np.square(mpg)                        -0.0029      0.000     -5.803      0.000      -0.004      -0.002
==============================================================================
Omnibus:                        6.860   Durbin-Watson:                   0.883
Prob(Omnibus):                  0.032   Jarque-Bera (JB):                5.754
Skew:                           1.027   Prob(JB):                       0.0563
Kurtosis:                       3.307   Cond. No.                     1.35e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.35e+03. This might indicate that there are strong multicollinearity or other numerical problems.
Binomial specifies a binomial distribution for the dependent variable; logit specifies the link function: $logit(p) = \ln\left(\frac{p}{1 - p}\right)$
logit = glm("vs ~ mpg + C(am, Treatment(reference=0))", df,
family=sm.families.Binomial(sm.families.links.logit)).fit()
print(logit.summary())
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:     ['vs[0]', 'vs[1]']   No. Observations:                   32
Model:                            GLM   Df Residuals:                       29
Model Family:                Binomial   Df Model:                            2
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -10.323
Date:                Sun, 19 Jul 2020   Deviance:                       20.646
Time:                        17:06:33   Pearson chi2:                     20.2
No. Iterations:                     6
Covariance Type:            nonrobust
======================================================================================================
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             12.7051      4.625      2.747      0.006       3.640      21.770
C(am, Treatment(reference=0))[T.1]     3.0073      1.599      1.880      0.060      -0.128       6.142
mpg                                   -0.6809      0.252     -2.698      0.007      -1.176      -0.186
======================================================================================================
In logistic regression, exponentiating a coefficient gives the OR (odds ratio); when the outcome is rare (<10%), the OR approximates the RR and can be used to quantify the exposure effect. For example, am's coefficient is 3.0073, meaning that holding mpg constant, the odds of the outcome in the am = 1 group are $e^{3.01} = 20.23$ times those in the am = 0 group; the p-value is marginally non-significant (p = 0.06). [Since the outcome rate of vs is over 40%, using the logistic-regression OR as the effect estimate is inappropriate here; a log-binomial model could instead estimate the prevalence ratio (PR), as sketched below.]
np.exp(logit.params[1:])
C(am, Treatment(reference=0))[T.1] 20.232169
mpg 0.506151
dtype: float64
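A minimal log-binomial sketch for the PR mentioned above (binomial family with a log link; note that log-binomial models can fail to converge, and newer statsmodels versions spell the link sm.families.links.Log()):

logbin = glm("vs ~ mpg + C(am, Treatment(reference=0))", df,
             family=sm.families.Binomial(sm.families.links.log)).fit()
# exponentiated coefficients estimate prevalence ratios (PR) rather than ORs
np.exp(logbin.params[1:])

The 95% CIs of the ORs above can be obtained analogously via np.exp(logit.conf_int()[1:]).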
In Poisson regression the dependent variable is the number of events per unit time or space, i.e. a count (a non-negative integer).
# generate a Poisson-distributed dependent variable count with mean 5
df["count"] = np.random.poisson(lam=5, size=32)
df["count"].value_counts()
5 11
4 5
3 4
6 3
2 3
10 2
8 2
13 1
7 1
Name: count, dtype: int64
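Poisson regression assumes equidispersion (conditional mean ≈ variance); a quick sanity check on the simulated count (a sketch; pandas' var uses ddof=1 by default):

# mean and variance should be close for Poisson-like data (both near 5 here)
df["count"].mean()
df["count"].var()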
poi = poisson("count ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(poi.summary())
Optimization terminated successfully.
         Current function value: 2.174190
         Iterations 5
                          Poisson Regression Results
==============================================================================
Dep. Variable:                  count   No. Observations:                   32
Model:                        Poisson   Df Residuals:                       29
Method:                           MLE   Df Model:                            2
Date:                Sun, 19 Jul 2020   Pseudo R-squ.:                 0.02153
Time:                        17:06:33   Log-Likelihood:                -69.574
converged:                       True   LL-Null:                       -71.105
Covariance Type:            nonrobust   LLR p-value:                    0.2163
======================================================================================================
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                              2.0939      0.313      6.691      0.000       1.481       2.707
C(vs, Treatment(reference=0))[T.1]    -0.0290      0.211     -0.137      0.891      -0.443       0.385
mpg                                   -0.0218      0.018     -1.199      0.230      -0.057       0.014
======================================================================================================
In Poisson regression, exponentiating a coefficient gives the RR (relative risk), which quantifies the exposure effect in association studies.
For example, vs's coefficient is -0.0290, meaning that holding mpg constant, the event rate in the vs = 1 group is $e^{-0.03} \approx 0.97$ times that in the vs = 0 group, but the p-value is not significant (p = 0.89).
np.exp(poi.params[1:])
C(vs, Treatment(reference=0))[T.1] 0.971449
mpg 0.978414
dtype: float64