赞
踩
继续上文,这次先补充一个原文框架:
上文复现了DEG和WGCNA之后,与原文章有出入,最后我借用sangerbox工具取交集,选出210个gene进行LASSO和SVM分析,继续筛选核心基因,其实可以使用更多的机器学习办法,如随机森林和Xgboost。目前生信的工具很多,仙桃,GEPIA,大家活学活用就好。
- rm(list = ls())
- library(tidyr)
- library(dplyr)
- library(tibble)
- library(data.table)
- #############################数据准备#####################
- setwd("D:\\BioR\\new\\NAFLD_Bio\\03intersect")
- intergene <- read.csv('vennData.csv',header = T)
- load('D:\\BioR\\new\\NAFLD_Bio\\01preprocess\\ex_ma.Rdata')
- ex_ma2 <- rownames_to_column(ex_ma,'gene')
- interexpr <- ex_ma[ex_ma2$gene %in% intergene$gene,]
- ##(2)读取分组信息,来自GEO
- group <- fread("group.txt",data.table = F)
- library(stringr)
- ##观察分组信息,发现‘_’为分隔,采用split函数分开字符串,并取第二个
- group$group2=trimws(str_split(group$group,'_',simplify = T)[,2])
- table(group$group2)
- ##此处提取HC和SS的分组信息,grep和grepl都可以,grep是提取包含某一或某些字符的行
- ##提取group2列里包含HC或者SS的行
- groupf <-group[grep('HC|SS',group$group2),]
- save(groupf,file='D:\\BioR\\new\\NAFLD_Bio\\01preprocess\\group.Rdata')
-
- groupf <- groupf[,-2]
- interexpr2 <- as.data.frame(t(interexpr))
- interexpr2 <- rownames_to_column(interexpr2,'symbol')
- interexpr2 <- merge(interexpr2,groupf,by='symbol')
- interexpr2 <- interexpr2[,c(1,212,2:211)] ##调换列的顺序,把分组信息放到第二列
- interexpr2 <- interexpr2[,-1]
- interexpr2$group2 <- ifelse(interexpr2$group2=='SS',1,0)
-
- ############################机器学习筛选核心基因#############################
- library(randomForestSRC)
- library(tidyverse)
- library(ggplot2)
- library(ggvenn)
- library(glmnet)
-
- #####################LASSO#############
- rt <- interexpr2
- x=as.matrix(rt[,c(2:ncol(rt))])
- y=data.matrix(rt$group2)
-
- library(caret)
- library(VennDiagram)
- fit = glmnet(x, y, family = "binomial", alpha = 1, lambda = NULL)
- # 绘制LASSO回归曲线图
- pdf("1A_lasso.pdf", width = 30, height = 15)
- plot(fit, xvar = "dev", label = TRUE)
- dev.off()
- #绘制LASSO回归10折交叉验证图
- cvfit = cv.glmnet(x, y,
- nfold=10,
- family = "binomial", type.measure = "class")
- pdf("2cvfit.pdf")
- plot(cvfit)
- dev.off()
- #查看最佳lambda
- cvfit$lambda.min
- # 获取LASSO选出来的特征
- myCoefs <- coef(cvfit, s="lambda.min")
- lasso_fea <- myCoefs@Dimnames[[1]][which(myCoefs != 0 )]
- (lasso_fea <- lasso_fea[-1])
- # 把lasso找到的特征保存到文件
- write.csv(lasso_fea,"3feature_lasso.csv")
LASSO回归结果和原文差别不小,我注意到原文四个基因都在那210个基因中,但是LASSO只选出了6个,其中只有2个基因和原文一致,LASSO具体参数尚不清楚。
FOSB | MYC | C13orf15 | THBS1 | SOCS2 | RFXDC2 |
- ##############随机森林#############
- library(e1071)
- source("D:\\BioR\\new\\NAFLD_Bio\\03intersect\\msvmRFE.R")
- input <- interexpr2
- ##############################
- library(caret)
- library(VennDiagram)
- input <- interexpr2
- #采用五折交叉验证 (k-fold crossValidation)
- svmRFE(input, k = 5, halve.above = 100) #分割数据,分配随机数
- nfold = 5
- nrows = nrow(input)
- folds = rep(1:nfold, len=nrows)[sample(nrows)]
- folds = lapply(1:nfold, function(x) which(folds == x))
- results = lapply(folds, svmRFE.wrap, input, k=5, halve.above=100) #特征选择
- top.features = WriteFeatures(results, input, save=F) #查看主要变量
- head(top.features)
- #把SVM-REF找到的特征保存到文件
- write.csv(top.features,"4feature_svm.csv")
- # 选前300个变量进行SVM模型构建,选取的变量越多,运行速度越慢!
- featsweep = lapply(1:40, FeatSweep.wrap, results, input) #40个变量
- # 画图
- no.info = min(prop.table(table(input[,1])))
- errors = sapply(featsweep, function(x) ifelse(is.null(x), NA, x$error))
- #绘制基于SVM-REF算法的错误率曲线图
- pdf("5B_svm-error.pdf",width = 5,height = 5)
- PlotErrors(errors, no.info=no.info) #查看错误率
- dev.off()
- #绘制基于SVM-REF算法的正确率曲线图
- #dev.new(width=4, height=4, bg='white')
- pdf("6B_svm-accuracy.pdf",width = 5,height = 5)
- Plotaccuracy(1-errors,no.info=no.info) #查看准确率
- dev.off()
- # 图中红色圆圈所在的位置,即错误率最低点
- which.min(errors)
- #比较lasso和SVM-REF方法一找出的特征变量,画Venn图
- (myoverlap <- intersect(lasso_fea, top.features[1:which.min(errors), "FeatureName"])) #交集
- #统计交叉基因有多少个
- summary(lasso_fea%in%top.features[1:which.min(errors), "FeatureName"])
- #绘制venn图
- pdf("7C_lasso_SVM_venn.pdf", width = 15, height = 8)
- grid.newpage()
- venn.plot<-venn.diagram(list(LASSO=lasso_fea, SVM_RFE=as.character(top.features[1:which.min(errors),"FeatureName"])), NULL,
- fill = c("#E31A1C","#E7B800"),
- alpha = c(0.5,0.5), cex = 4, cat.fontface=3,
- category.names = c("LASSO", "SVM_RFE"),
- main = "Overlap")
- grid.draw(venn.plot)
- dev.off()
取前38个基因和LASSO取交集,只有SOCS2和FOSB和原文一致,SVM估计参数也不一样,恳请指正。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。