赞
踩
logistic回归是最常用的多因素分析方法之一,主要应用在两个方面,一个是筛选独立的危险因素,另一个就是构建临床预测模型。我们把最常用的logistic回归相关的R代码进行了整理,相关的参考资料也给出了链接,方便大家进行进一步的阅读学习,希望能帮助大家更快的进行统计分析工作。
- # 使用软件 R-4.2.0
- #一、使用数据集
- library(survival)
- #data(lung) # 加载lung数据集
- mydata<-na.omit(survival::lung) #去除缺失值
- #View(mydata) # 查看数据集
- str(mydata)
##########################################################
#二、数据字段说明
- #survival包的lung数据集
- #lung数据集:NCCTG晚期肺癌患者的生存率。
- #inst:机构代码;
- #time:生存天数(以天为单位的生存时间);
- #status:生存状态,1为删失,2为死亡;
- #age:年龄;sex # 性别,1为男性,2为女性;
- #ph.ecog、ph.karno、pat.karno # 为病人和患者评分;
- #ph.ecog:ECOG评分(0 =好,5 =死亡)
- #ph.karno:医师进行的Karnofsky评分(0 = 差,100 = 好)
- #pat.karno:患者自行进行的Karnofsky评分(0 = 差,100 = 好)
- #meal.cal:用餐时消耗的卡路里;
- #wt.loss:最近6个月的体重下降。
#三、去除inst和time两个变量
- mydata<-mydata[,-c(1,2)]
- mydata$status<-ifelse(mydata$status==2,1,0)
- #mydata$sex=ifelse(mydata$sex=='f',1,0)
- str(mydata)
#以status为因变量,以其他变量为自变量构建logistic回归模型
################
#四、拆分数据集,训练集和测试集
- #install.packages("caret")
- library(caret)
- set.seed(300)
- trainid<-createDataPartition(y=mydata$status,p=0.70,list=F)
- traindata<-mydata[trainid, ]
- testdata<-mydata[-trainid,]
#四、快速统计描述
- #install.packages("PerformanceAnalytics")
- library(PerformanceAnalytics)
- chart.Correlation(mydata[,c(2,4:8)], histogram=TRUE,
- pch=19,method = 'spearman')
#五、快速组间比较
- #参考:https://mp.weixin.qq.com/s/9yY8UIyEVqkiLhHJrr0Row
- #install.packages("scitb")
- library(scitb)
- allVars<-c("age", "sex","ph.ecog","ph.karno", "pat.karno", "meal.cal", "wt.loss")
- fvars<-c("sex","ph.ecog")
- strata<-"status"
- table1<-scitb1(vars=allVars,fvars=fvars,strata=strata,data=mydata) #自动检验正态性
- table1<-scitb1(vars=allVars,fvars=fvars,strata=strata,data=mydata,
- nonnormal=c("meal.cal"),atotest=F) #指定非正态
- table1
- write.csv(table1,file= "1.csv",row.names = F) #导出table1
########################
#四、基线特征描述统计
- #install.packages("autoReg")
- library(autoReg)
- library(dplyr)
- table1=gaze(status~.,data=mydata) %>% myft()
- table1
- table2=gaze(status~.,data=mydata,method=3) %>% myft()
- table2
- #install.packages("rrtable")
- library(rrtable)
- table2pptx(table1) #Exported table as Report.pptx
- table2docx(table2) #Exported table as Report.docx
#六、批量单因素分析
- #多个变量因子化,运行autoReg命令前,把分类变量因子化
- #mydata[,c("sex","ph.ecog")]<-lapply(mydata[,c("sex","ph.ecog")],as.factor)
- mydata$sex.factor=factor(mydata$sex,labels=c("Female","Male"))
- mydata$ph.ecog.factor=factor(mydata$ph.ecog)
- fit<- glm(status~sex.factor+age+ph.ecog.factor+ph.karno+pat.karno+meal.cal+wt.loss,family=binomial(link = "logit"),
- data = mydata)
- ##logistic回归结果快速整理
- # https://cloud.tencent.com/developer/article/2146471
- #install.packages("autoReg")
- library(autoReg)
- autoReg(fit,uni=TRUE,threshold=0.05)
- autoReg(fit)
- result<-autoReg(fit, uni=TRUE) %>% myft()
- result
- library(rrtable)
- table2pptx(coxresult) #Exported table as Report.pptx
- table2docx(coxresult) #Exported table as Report.docx
- #对比一下结果
- fit2<-glm(status~sex.factor,family=binomial(link = "logit"),
- data = mydata)
- summary(fit2)
- exp(coefficients(fit2))
####临床预测模型评价指标
- #生成预测值
- fit3<- glm(status~sex+age+pat.karno+meal.cal+wt.loss,family=binomial(link = "logit"),data = traindata)
- #在建模人群中计算预测值,生成新变量
- traindata$predict<- predict(newdata=traindata,fit3,"response")
- #在验证人群计算预测值,命名为predict
- testdata$predict<- predict(newdata=testdata,fit3,"response")
- #######ROC曲线下面积的计算#############################
- #参考:https://zhuanlan.zhihu.com/p/562605972
- #参考:https://mp.weixin.qq.com/s/mCwWuRK7gQy-79rgXi2ZRA
- library(pROC)
- #利用pROC绘制ROC要注意direction参数的设置,如果算出来auc<0.05,可以设置为controls>cases,
- #较小的指标结果代表更加肯定的检验。
- #在建模人群绘制ROC
- trainroc <- roc(status~predict, data = traindata,smooth=F)
- plot(trainroc, print.auc=TRUE, print.thres=TRUE,main = "ROC curve in traindata",
- identity.lty=1,identity.lwd=1)
- auc(trainroc);ci(trainroc)
-
-
- #在验证人群中绘制ROC
- testroc <- roc(status~predict, data = testdata,smooth=F,direction=c(">"))
- plot(testroc, print.auc=TRUE, print.thres=TRUE,main = "ROC curve in testdata",
- identity.lty=1,identity.lwd=1)
- auc(testroc);ci(testroc)
- #ROC比较
- #参考:https://zhuanlan.zhihu.com/p/562605972
- testobj <- roc.test(trainroc, testroc)
- testobj
- #################
- #校准曲线
- #在建模人群中绘制,p<0.05说明校准度不好(注:如果一开始y变量因子化,需要在此处数字化)
- val.prob(traindata$predict,as.numeric(traindata$status))
- #在验证人群中绘制
- val.prob(testdata$predict,as.numeric(testdata$status))
- ###################DCA################################
- #临床决策曲线DCA
- library(rms)
- #install.packages("rmda")
- library(rmda)
- modul<- decision_curve(status~predict,data=traindata,
- family = binomial(link ='logit'),
- thresholds= seq(0,1, by = 0.01),
- confidence.intervals = 0.95)
- plot_decision_curve(modul,
- curve.names="Nonadherence prediction nomogram",xlab="Threshold probability",
- cost.benefit.axis =FALSE,col= "blue",
- confidence.intervals=FALSE,
- standardize = FALSE)
- ##测试集DCA
- modul<- decision_curve(status~predict,data=testdata,
- family = binomial(link ='logit'),
- thresholds= seq(0,1, by = 0.01),
- confidence.intervals = 0.95)
- plot_decision_curve(modul,
- curve.names="Nonadherence prediction nomogram",xlab="Threshold probability",
- cost.benefit.axis =FALSE,col= "blue",
- confidence.intervals=FALSE,
- standardize = FALSE)
########################################################
- #NRI
- #参考:https://mp.weixin.qq.com/s/Hx1B1HuN8un-LBnKw7Ro3Q
- #参考:https://mp.weixin.qq.com/s/geQOXGto-QWjz5ZscMgLEQ
- #参考:https://mp.weixin.qq.com/s/phsatwnC05S5-UHfxlxIAA
- #参考:https://mp.weixin.qq.com/s/ZBPYBPBZ4FVp4FkgsnvYoQ
- #install.packages("nricens")
- library(nricens)
- fit4<- glm(status~sex+age+pat.karno+meal.cal+wt.loss,family=binomial(link = "logit"),data = traindata,x=TRUE)
- fit5<- glm(status~sex+age+pat.karno+meal.cal,family=binomial(link = "logit"),data = traindata,x=TRUE)
- NRI <- nribin(mdl.std = fit4, mdl.new = fit5,
- updown = 'category',cut = c(0.3,0.6),
- niter = 10000,alpha = 0.05)
##############不同模型的比较#######################
- #参考:R语言实战
- anova(fit4,fit5,test = "Chisq")
- #使用anova函数中的卡方检验进行比较,如果p>0.05,则说明两个模型拟合程度一样好。
##########################################################
- #NRI与IDI
- #参考:https://mp.weixin.qq.com/s/7CFyCaa97zzTTxI-hrfvWA
- #参考:https://mp.weixin.qq.com/s/W2EtlEUepOI8TZxNnhCoaw
- #install.packages("PredictABEL")
- library(PredictABEL)
- fit6<- glm(status~sex+age+pat.karno+meal.cal+wt.loss,family=binomial(link = "logit"),data = traindata)
- fit7<- glm(status~sex+age+pat.karno+meal.cal,family=binomial(link = "logit"),data = traindata)
- predict6<-fit6$fitted.values
- predict7<-fit7$fitted.values
- reclassification(data = traindata,cOutcome = 1,predrisk1 = predict6,predrisk2 = predict7,
- cutoff = c(0,0.30,0.60,1))
- ##############################构建列线图##############################################
- #列线图
- #install.packages("rms")
- library(rms)
- ddist <- datadist(traindata)
- options(datadist="ddist")
- fit8<-lrm(status~age+sex+meal.cal,data=traindata,x=T,y=T)
- mynom<- nomogram(fit8, fun=plogis,fun.at=c(0.01,seq(0.1,0.9,by=0.1),0.99),lp=F, funlabel="Risk of disease")
- plot(mynom)
- ######################扩展################################
- #稳健logistic回归:当数据出现离群点和强影响时,可使用稳健logistic回归,使用robustbase包的glmRob()。
- #多项式logistic回归:使用mlogit包的mlogit()。
- #有序logistic回归:MASS包的pllyr()。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。