当前位置:   article > 正文

logistic回归模型(R代码大全)

对变量进行logitech分析r语言代码

logistic回归是最常用的多因素分析方法之一,主要应用在两个方面,一个是筛选独立的危险因素,另一个就是构建临床预测模型。我们把最常用的logistic回归相关的R代码进行了整理,相关的参考资料也给出了链接,方便大家进行进一步的阅读学习,希望能帮助大家更快的进行统计分析工作。

  1. # 使用软件 R-4.2.0
  2. #一、使用数据集
  3. library(survival)
  4. #data(lung) # 加载lung数据集
  5. mydata<-na.omit(survival::lung) #去除缺失值
  6. #View(mydata) # 查看数据集
  7. str(mydata)

5b86318574a06c31a6b76eb158862823.png

##########################################################

#二、数据字段说明

  1. #survival包的lung数据集
  2. #lung数据集:NCCTG晚期肺癌患者的生存率。
  3. #inst:机构代码;
  4. #time:生存天数(以天为单位的生存时间);
  5. #status:生存状态,1为删失,2为死亡;
  6. #age:年龄;sex # 性别,1为男性,2为女性;
  7. #ph.ecog、ph.karno、pat.karno # 为病人和患者评分;
  8. #ph.ecog:ECOG评分(0 =好,5 =死亡)
  9. #ph.karno:医师进行的Karnofsky评分(0 = 差,100 = 好)
  10. #pat.karno:患者自行进行的Karnofsky评分(0 = 差,100 = 好)
  11. #meal.cal:用餐时消耗的卡路里;
  12. #wt.loss:最近6个月的体重下降。

#三、去除inst和time两个变量

  1. mydata<-mydata[,-c(1,2)]
  2. mydata$status<-ifelse(mydata$status==2,1,0)
  3. #mydata$sex=ifelse(mydata$sex=='f',1,0)
  4. str(mydata)

c153f53e99796751ebfca1c0ec8a09b8.png

#以status为因变量,以其他变量为自变量构建logistic回归模型

################

#四、拆分数据集,训练集和测试集

  1. #install.packages("caret")
  2. library(caret)
  3. set.seed(300)
  4. trainid<-createDataPartition(y=mydata$status,p=0.70,list=F)
  5. traindata<-mydata[trainid, ]
  6. testdata<-mydata[-trainid,]

#四、快速统计描述

  1. #install.packages("PerformanceAnalytics")
  2. library(PerformanceAnalytics)
  3. chart.Correlation(mydata[,c(2,4:8)], histogram=TRUE,
  4. pch=19,method = 'spearman')

ba19fb7e097ffea11775f24f8d44675b.png

#五、快速组间比较

  1. #参考:https://mp.weixin.qq.com/s/9yY8UIyEVqkiLhHJrr0Row
  2. #install.packages("scitb")
  3. library(scitb)
  4. allVars<-c("age", "sex","ph.ecog","ph.karno", "pat.karno", "meal.cal", "wt.loss")
  5. fvars<-c("sex","ph.ecog")
  6. strata<-"status"
  7. table1<-scitb1(vars=allVars,fvars=fvars,strata=strata,data=mydata) #自动检验正态性
  8. table1<-scitb1(vars=allVars,fvars=fvars,strata=strata,data=mydata,
  9. nonnormal=c("meal.cal"),atotest=F) #指定非正态
  10. table1
  11. write.csv(table1,file= "1.csv",row.names = F) #导出table1

de892371706d9694f832b12e053df245.png

3dffb460cfc2634cfa6e1481e617b59d.png

########################

#四、基线特征描述统计

  1. #install.packages("autoReg")
  2. library(autoReg)
  3. library(dplyr)
  4. table1=gaze(status~.,data=mydata) %>% myft()
  5. table1
  6. table2=gaze(status~.,data=mydata,method=3) %>% myft()
  7. table2

8df9c0639a683a3df512b6e57cf3ac6d.png

784ed741697d3f66c3eee4876d377eb5.png

  1. #install.packages("rrtable")
  2. library(rrtable)
  3. table2pptx(table1) #Exported table as Report.pptx
  4. table2docx(table2) #Exported table as Report.docx

#六、批量单因素分析

  1. #多个变量因子化,运行autoReg命令前,把分类变量因子化
  2. #mydata[,c("sex","ph.ecog")]<-lapply(mydata[,c("sex","ph.ecog")],as.factor)
  3. mydata$sex.factor=factor(mydata$sex,labels=c("Female","Male"))
  4. mydata$ph.ecog.factor=factor(mydata$ph.ecog)
  5. fit<- glm(status~sex.factor+age+ph.ecog.factor+ph.karno+pat.karno+meal.cal+wt.loss,family=binomial(link = "logit"),
  6. data = mydata)
  1. ##logistic回归结果快速整理
  2. # https://cloud.tencent.com/developer/article/2146471
  3. #install.packages("autoReg")
  4. library(autoReg)
  5. autoReg(fit,uni=TRUE,threshold=0.05)
  6. autoReg(fit)
  7. result<-autoReg(fit, uni=TRUE) %>% myft()
  8. result
  9. library(rrtable)
  10. table2pptx(coxresult) #Exported table as Report.pptx
  11. table2docx(coxresult) #Exported table as Report.docx

d7fc8a20c47ade307fb3620b02a72a30.png

  1. #对比一下结果
  2. fit2<-glm(status~sex.factor,family=binomial(link = "logit"),
  3. data = mydata)
  4. summary(fit2)
  5. exp(coefficients(fit2))

c1e12cebb226f0e99ce09ebeba827b7f.png

####临床预测模型评价指标

  1. #生成预测值
  2. fit3<- glm(status~sex+age+pat.karno+meal.cal+wt.loss,family=binomial(link = "logit"),data = traindata)
  3. #在建模人群中计算预测值,生成新变量
  4. traindata$predict<- predict(newdata=traindata,fit3,"response")
  5. #在验证人群计算预测值,命名为predict
  6. testdata$predict<- predict(newdata=testdata,fit3,"response")
  1. #######ROC曲线下面积的计算#############################
  2. #参考:https://zhuanlan.zhihu.com/p/562605972
  3. #参考:https://mp.weixin.qq.com/s/mCwWuRK7gQy-79rgXi2ZRA
  4. library(pROC)
  5. #利用pROC绘制ROC要注意direction参数的设置,如果算出来auc<0.05,可以设置为controls>cases,
  6. #较小的指标结果代表更加肯定的检验。
  7. #在建模人群绘制ROC
  8. trainroc <- roc(status~predict, data = traindata,smooth=F)
  9. plot(trainroc, print.auc=TRUE, print.thres=TRUE,main = "ROC curve in traindata",
  10. identity.lty=1,identity.lwd=1)
  11. auc(trainroc);ci(trainroc)
  12. #在验证人群中绘制ROC
  13. testroc <- roc(status~predict, data = testdata,smooth=F,direction=c(">"))
  14. plot(testroc, print.auc=TRUE, print.thres=TRUE,main = "ROC curve in testdata",
  15. identity.lty=1,identity.lwd=1)
  16. auc(testroc);ci(testroc)
  17. #ROC比较
  18. #参考:https://zhuanlan.zhihu.com/p/562605972
  19. testobj <- roc.test(trainroc, testroc)
  20. testobj

078d203b6c80bd8734a6b83028ecc3a0.png

08296ce9bcce2641e5f4af8c6255d7ac.png

23d947eb990d96dc631e3510056b2585.png

  1. #################
  2. #校准曲线
  3. #在建模人群中绘制,p<0.05说明校准度不好(注:如果一开始y变量因子化,需要在此处数字化)
  4. val.prob(traindata$predict,as.numeric(traindata$status))
  5. #在验证人群中绘制
  6. val.prob(testdata$predict,as.numeric(testdata$status))

10354a4fa06ad3a0216a5663ab64a709.png

  1. ###################DCA################################
  2. #临床决策曲线DCA
  3. library(rms)
  4. #install.packages("rmda")
  5. library(rmda)
  6. modul<- decision_curve(status~predict,data=traindata,
  7. family = binomial(link ='logit'),
  8. thresholds= seq(0,1, by = 0.01),
  9. confidence.intervals = 0.95)
  10. plot_decision_curve(modul,
  11. curve.names="Nonadherence prediction nomogram",xlab="Threshold probability",
  12. cost.benefit.axis =FALSE,col= "blue",
  13. confidence.intervals=FALSE,
  14. standardize = FALSE)
  15. ##测试集DCA
  16. modul<- decision_curve(status~predict,data=testdata,
  17. family = binomial(link ='logit'),
  18. thresholds= seq(0,1, by = 0.01),
  19. confidence.intervals = 0.95)
  20. plot_decision_curve(modul,
  21. curve.names="Nonadherence prediction nomogram",xlab="Threshold probability",
  22. cost.benefit.axis =FALSE,col= "blue",
  23. confidence.intervals=FALSE,
  24. standardize = FALSE)

d36ec8de4fd016593af8398a0552f7b8.png

########################################################

  1. #NRI
  2. #参考:https://mp.weixin.qq.com/s/Hx1B1HuN8un-LBnKw7Ro3Q
  3. #参考:https://mp.weixin.qq.com/s/geQOXGto-QWjz5ZscMgLEQ
  4. #参考:https://mp.weixin.qq.com/s/phsatwnC05S5-UHfxlxIAA
  5. #参考:https://mp.weixin.qq.com/s/ZBPYBPBZ4FVp4FkgsnvYoQ
  6. #install.packages("nricens")
  7. library(nricens)
  8. fit4<- glm(status~sex+age+pat.karno+meal.cal+wt.loss,family=binomial(link = "logit"),data = traindata,x=TRUE)
  9. fit5<- glm(status~sex+age+pat.karno+meal.cal,family=binomial(link = "logit"),data = traindata,x=TRUE)
  10. NRI <- nribin(mdl.std = fit4, mdl.new = fit5,
  11. updown = 'category',cut = c(0.3,0.6),
  12. niter = 10000,alpha = 0.05)

b8ee2835a2099d6b8e7a343cb73de357.png

##############不同模型的比较#######################

  1. #参考:R语言实战
  2. anova(fit4,fit5,test = "Chisq")
  3. #使用anova函数中的卡方检验进行比较,如果p>0.05,则说明两个模型拟合程度一样好。

c42dd777220c54c142a80ddaf1a54dcd.png

##########################################################

  1. #NRI与IDI
  2. #参考:https://mp.weixin.qq.com/s/7CFyCaa97zzTTxI-hrfvWA
  3. #参考:https://mp.weixin.qq.com/s/W2EtlEUepOI8TZxNnhCoaw
  4. #install.packages("PredictABEL")
  5. library(PredictABEL)
  6. fit6<- glm(status~sex+age+pat.karno+meal.cal+wt.loss,family=binomial(link = "logit"),data = traindata)
  7. fit7<- glm(status~sex+age+pat.karno+meal.cal,family=binomial(link = "logit"),data = traindata)  
  8. predict6<-fit6$fitted.values
  9. predict7<-fit7$fitted.values
  10. reclassification(data = traindata,cOutcome = 1,predrisk1 = predict6,predrisk2 = predict7,
  11. cutoff = c(0,0.30,0.60,1))

85aeaa90298a8f7979defc42a22b1420.png

  1. ##############################构建列线图##############################################
  2. #列线图
  3. #install.packages("rms")
  4. library(rms)
  5. ddist <- datadist(traindata)
  6. options(datadist="ddist")
  7. fit8<-lrm(status~age+sex+meal.cal,data=traindata,x=T,y=T)
  8. mynom<- nomogram(fit8, fun=plogis,fun.at=c(0.01,seq(0.1,0.9,by=0.1),0.99),lp=F, funlabel="Risk of disease")
  9. plot(mynom)

cce2971b4020df5b3d7200d5ec0abcec.png

  1. ######################扩展################################
  2. #稳健logistic回归:当数据出现离群点和强影响时,可使用稳健logistic回归,使用robustbase包的glmRob()。
  3. #多项式logistic回归:使用mlogit包的mlogit()。
  4. #有序logistic回归:MASS包的pllyr()。
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/629913
推荐阅读
相关标签
  

闽ICP备14008679号