当前位置:   article > 正文

GEO芯片数据处理、多芯片数据合并、差异分析、核心基因提取、核心基因与免疫细胞相关性_如何查询肿瘤免疫细胞浸润的gse数据

如何查询肿瘤免疫细胞浸润的gse数据

R:https://www.r-project.org/
Rstudio:  https://posit.co/products/open-source/rstudio/

一.数据准备

  1. 下载探针矩阵probe.txt和平台注释ann.txt

部分平台注释文件不能下载,解决办法:

  • gset <- getGEO("GSE149507",destdir = ".",getGPL = T)→gset[["GSE149507_series_matrix.txt.gz"]]@featureData@data
  • 下载soft文件:soft <-getGEO(filename="GSE149507_family.soft")→soft@gpls[["GPL23270"]]@dataTable@table#getGEO()可下载网页文件,也可读取soft文件

二. 探针矩阵转换为基因矩阵

  1. !setwd("D:\\immune cell infiltration\\GEO")
  2. getwd()
  3. 2.在GEO数据库中下载GSE信息
  4. #install.packages("tidyverse")
  5. library(GEOquery)
  6. !gset <- getGEO(GEO = "GSE124226",destdir = ".",getGPL = F)#destdir="."表示下载到当前路径
  7. e2 <- gset[[1]]
  8. phe <- e2@phenoData@data #提取临床信息
  9. exp <- e2@assayData[["exprs"]] #提取表达矩阵
  10. 3.在GEO数据库中下载对应平台信息:(平台信息含探针对应的基因名) 该处ID_symbol处需替换
  11. library(data.table)
  12. ann=fread("ann.txt",sep="\t",header=T,data.table = F)#能在excel中打开的文件都可以用fread函数读取
  13. ID_symbol <- ann[,c("ID","Gene Symbol")]
  14. 4.部分探针对应的基因名有两个(如"DDR1 /// MIR4640" ),将排在后面的去掉 该处ID_symbol、ID_gene_symbol处需替换
  15. symbol <- ID_symbol$`Gene Symbol`
  16. symbol1 <- strsplit(symbol,split=" /// ",fixed=T)#fixed=T表示精确查找
  17. gene_symbol <- sapply(symbol1,function(x){x[1]})#提取每一个子列表中的第一个元素
  18. ID_gene_symbol <- data.frame(ID_symbol$ID,gene_symbol)
  19. rm(ID_symbol,symbol,symbol1,gene_symbol)
  20. 5.将表达矩阵与基因名合并
  21. library(dplyr)
  22. exp <- as.data.frame(exp)#merge函数只对两个data.frame进行合并
  23. exp1 <- merge(ID_gene_symbol,exp,by.x = 1,by.y = 0)
  24. exp2 <- distinct(exp1,gene_symbol,.keep_all=T)
  25. exp3 <- na.omit(exp2)#将缺失的基因名去除
  26. rownames(exp3) <- exp3$gene_symbol
  27. exp4 <- exp3[,-c(1,2)]
  28. 6.将对照组样本放在前面并进行批次校正
  29. !control <- c()#对照样本所在列
  30. !treat <- c(seq(1,35,2))##查看tumor样本所在列
  31. exp5 <- exp4[,c(control,treat)]#将对照样本放在前面
  32. !group_list <- c(rep("control",18),rep("treat",18))
  33. group_list <- factor(group_list)
  34. group_list <- relevel(group_list,ref="control")#强制限定顺序,control在前
  35. library(limma)
  36. exp6 <- normalizeBetweenArrays(exp5)#除去批次效应
  37. boxplot(exp5,outline=F,notch=T,col=group_list,las=2)
  38. boxplot(exp6,outline=F,notch=T,col=group_list,las=2)#绘制出的图形相同,说明样本已经进行过归一化处理
  39. rm(ID_gene_symbol,exp,exp1,exp2,exp3,exp4,exp5,control,treat)
  40. 至此,我们获得了想要的横坐标为基因名,纵坐标为样本名的表达矩阵exp6

查看数据是否已经取了log2

  1. ex <- exp6
  2. qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
  3. LogC <-(qx[5]>100)||
  4. (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  5. (qx[2]>0&&qx[2]<1&&qx[4]>1&&qx[4]<2)
  6. if(LogC){
  7. ex[which(ex<=0)]<-NaN
  8. exprSet <- log2(ex)
  9. print("需要取log2")}else{print("无需取log2")}
  10. rm(ex,exp6,LogC,qx)
  11. exp6 <- exprSet
  12. write.csv(exp6,"GSE34526.NORMALIZE.CSV")

三、多芯片数据合并

  • 准备处理好的多个芯片数据
  1. setwd("D:\\hebing")
  2. getwd()
  3. BiocManager::install("sva")
  4. # 安装包
  5. install.packages("FactoMineR")
  6. install.packages("factoextra")
  7. library(sva)
  8. library(FactoMineR)
  9. library(factoextra)
  10. GSE1=read.csv(file = "GSE34526.NORMALIZE.csv",header = T,row.names = 1)
  11. GSE2=read.csv(file = "GSE137684.NORMALIZE.csv",header = T,row.names = 1)
  12. GSE3=read.csv(file = "GSE80432.NORMALIZE.csv",header = T,row.names = 1)
  13. #GEO数据数据合并
  14. name1 <- intersect(rownames(GSE1),rownames(GSE2))
  15. expr <- cbind(GSE1[name1,],GSE2[name1,])
  16. #若是多个数据(>2),则执行如下命令
  17. name1 <- intersect(rownames(GSE1),rownames(GSE2))
  18. name2 <- intersect(name1,rownames(GSE3))
  19. expr <- cbind(GSE1[name2,],GSE2[name2,],GSE3[name2,])
  20. batch <- c(rep("GSE5090",17),rep("GSE43264",15),rep("GSE6798",29),rep("GSE34526",10),rep("GSE10946",23))
  21. tissue <- c(rep(c("control","treat","control","treat","control","treat","control","treat","control","treat"),c(8,9,7,8,13,16,3,7,11,12)))
  22. mode <- model.matrix(~tissue)
  23. ########combat去除批次效应
  24. combat_expr <- ComBat(dat = expr,
  25. batch = batch,
  26. mod = mode)
  27. #combat_expr就是校正后且合并后的GEO数据
  28. ####removeBatchEffect去除批次效应
  29. limma_expr <- removeBatchEffect(expr,batch = batch,design = mode)
  30. rm(expr,GSE1,GSE2,GSE3,name1,name2)
  31. #矫正前PCA分析
  32. pre.pca <- PCA(t(expr),graph = FALSE)#expr1与前文对应
  33. fviz_pca_ind(pre.pca,
  34. geom= "point",
  35. col.ind = batch,#batch与前文对应
  36. addEllipses = TRUE,#添加椭圆
  37. legend.title="Group" )#图例名为Group
  38. #此处保存一下图片,方便后续对比
  39. #矫正后PCA分析(1和2都运行,选择百分数值较小的一个进行后续分析)
  40. #1
  41. combat.pca <- PCA(t(combat_expr),graph = FALSE)
  42. fviz_pca_ind(combat.pca,
  43. geom= "point",
  44. col.ind = batch,
  45. addEllipses = TRUE,
  46. legend.title="Group" )
  47. #2或
  48. combat.pca <- PCA(t(limma_expr),graph = FALSE)
  49. fviz_pca_ind(combat.pca,
  50. geom= "point",
  51. col.ind = batch,
  52. addEllipses = TRUE,
  53. legend.title="Group" )
  54. write.csv(limma_expr,"limma_expr.csv")
  55. write.csv(combat_expr,"combat_expr.csv")
  56. #箱线图查看前后对比
  57. par(mfrow=c(1,2))
  58. boxplot(as.data.frame(expr),main="Original")
  59. boxplot(as.data.frame(combat_expr),main="Batch corrected")
  60. 选取较好的矫正方法进行后续分析

 最终得到combat_expr.csv或limma_expr.csv,文件为合并后的GEO数据(行名样本名,列名基因名),样本名顺序按GSE1,GSE2,GSE3排列,每个GSE中对照组样本在前,疾病组样本在后

 四、将多芯片数据整理为对照组在前,疾病组样本在后

  1. tissue <- c(rep(c("control","treat","control","treat","control","treat"),c(3,7,4,8,8,8)))
  2. setwd("C:\\Users\\LENOVO\\Desktop\\wgcna")
  3. limma_expr <- read.table("limma_expr.csv",sep=",",header=T,row.names = 1)
  4. data <- cbind(t(limma_expr),tissue)
  5. conData <- subset(data,tissue=="control")
  6. treatData <- subset(data,tissue=="treat")
  7. conNum <- nrow(conData)
  8. treatNum <- nrow(treatData)
  9. data=rbind(conData,treatData)
  10. data <- t(data[,-15068])
  11. write.csv(data,"limma_expr_control+treat.csv")
  12. rm(conData,treatData,limma_expr)
  13. fenzu=data.frame(Normal=c(rep(1,conNum),rep(0,treatNum)),
  14. Tumor=c(rep(0,conNum),rep(1,treatNum)))
  15. row.names(fenzu)=colnames(data)
  16. write.csv(fenzu,"fenzu.csv")#行名样品名,列名Normal和Tumor,是用1表示,否用0表示,分组也可用作临床信息

 最终得到data(limma_expr_control+teart.csv),fenzu.csv

五、差异分析(第六节)

  1. library(limma)
  2. exprSet <- read.csv("limma_expr_control+treat.csv",row.names = 1)
  3. #构建分组矩阵
  4. fenzu <- read.csv("fenzu.csv",row.names = 1)
  5. table(fenzu)
  6. group <- c(rep("con",15),rep("treat",23))
  7. group <- factor(group,levels = c("con","treat"))
  8. design <- model.matrix(~0+group)
  9. colnames(design) <- levels(group)
  10. design
  11. ### 2.构建比较矩阵
  12. contrast.matrix <- makeContrasts(treat - con,levels=design)
  13. #线性拟合模型构建
  14. fit <- lmFit(exprSet,design)#线性模型拟合
  15. fit <- contrasts.fit(fit, contrast.matrix)
  16. fit <- eBayes(fit)#贝叶斯检验
  17. ### 4.最终得到差异分析结果
  18. allDiff=topTable(fit,number=Inf)
  19. #输出所有基因的差异情况
  20. write.table(allDiff,"allDiff.txt",sep="\t",col.names = NA)
  21. #write.csv(allDiff,"allDiff.csv")
  22. #基因上调和下调设定
  23. data <- allDiff
  24. data$significant <- "stable"
  25. data$significant[data$logFC>=1 & data$P.Value<0.05]="up"
  26. data$significant[data$logFC<=-1 & data$P.Value<0.05]="down"
  27. #输出差异结果
  28. diffSig=data[with(data,(abs(logFC)>1 & P.Value<0.05)),]
  29. write.table(diffSig,"diff_1_0.05.txt",sep="\t",col.names = NA,quote=F)
  30. #筛选差异基因(基础表达量高、变化明显、p值显著)
  31. select.log2FC <- abs(data$logFC)>1
  32. select.Pvalue <- data$P.Value<0.05
  33. select.vec <- select.log2FC & select.Pvalue
  34. table(select.vec)
  35. degs.list <- as.character(rownames(data))[select.vec] #筛选出的基因名
  36. write.table(degs.list,"DEG.txt",sep="\t",quote=F,row.names=F,col.names = F)

图形绘制

4.1 火山图

图形绘制

  1. library(ggplot2)
  2. library(ggrepel)
  3. #ggplot中的每一个+号代表加一个参数的内容
  4. pdf("volcano.pdf",width = 6,height = 5)
  5. ggplot(data,aes(logFC,-1*log10(P.Value)))+xlim(-2,2)+ylim(0,5)+ #x轴为logFC,p值越小,y值越大
  6. geom_point(aes(color=significant),size=0.8)+theme_classic()+ #画散点图,size代表点的大小,颜色分类按照significant
  7. scale_color_manual(values = c("green","black","red"))+ #点的颜色
  8. geom_hline(yintercept = -log10(0.05),linetype=4,size=0.3)+ #垂直于y轴的线
  9. geom_vline(xintercept = c(-1,1),linetype=4,size=0.3)+ #垂直于x轴的线
  10. theme(title = element_text(size=18),text = element_text(size=18))+
  11. labs(x="log2(foldchange)",y="-log10(P_Value)")
  12. dev.off()

在火山图中标注基因名称

  1. setwd("D:\\limma_expr\\limma差异分析")
  2. BiocManager::install("ggrepel")
  3. library(ggrepel)
  4. data <- read.table("allDiff.txt",sep="\t",header=T,row.names = 1)
  5. data$significant <- "stable"
  6. data$significant[data$logFC>=1 & data$P.Value<0.05]="up"
  7. data$significant[data$logFC<=-1 & data$P.Value<0.05]="down"
  8. label.deg <- c("SYK","FCGR3A","TLR4",
  9. "TYROBP","FCER1G","CD4","ITGAM","PLCG2","ITGB2","VAV1")
  10. p=ggplot(data,aes(logFC,-1*log10(P.Value)))+xlim(-1.5,2)+ylim(0,5)+
  11. geom_point(aes(color=significant),size=0.8)+theme_classic()+
  12. scale_color_manual(values = c("green","black","red"))+
  13. geom_hline(yintercept = -log10(0.05),linetype=4,size=0.3)+
  14. geom_vline(xintercept = c(-1,1),linetype=4,size=0.3)+
  15. theme(title = element_text(size=18),text = element_text(size=18))+
  16. labs(x="log2(foldchange)",y="-log10(P_Value)")
  17. data_selected <- data[label.deg,] #筛选出的基因所在行
  18. p + # 基于普通火山图p
  19. geom_point(data = data_selected, # 添加强调显示的散点,仅限于上调基因
  20. aes(x = logFC, y = -log10(P.Value)),
  21. color = 'red3', size = 4.5, alpha = 0.2) + # 设置散点的颜色、大小和透明度
  22. geom_label_repel(data = data_selected, # 添加带边框的标签,仅限于上调基因
  23. aes(x = logFC, y = -log10(P.Value), label = rownames(data_selected)),
  24. seed = 233, # 设置随机数种子,用于确定标签位置
  25. size = 3.5, # 设置标签的字体大小
  26. color = 'black', # 设置标签的字体颜色
  27. min.segment.length = 0, # 始终为标签添加指引线段
  28. force = 2, # 设置标签重叠时的排斥力
  29. force_pull = 2, # 设置标签与数据点之间的吸引力
  30. box.padding = 0.4, # 设置标签周围的填充量
  31. max.overlaps = Inf) # 设置排斥重叠过多标签的阈值为无穷大,保持始终显示所有标签

4.2 热图

分组、选基因

  1. !annotation_col1 <- data.frame(
  2. database <- c(rep("GEO",36)),
  3. CellType <- c(rep("control",18),rep("treatment",18))
  4. )
  5. rownames(annotation_col1) <- colnames(exp6)
  6. class(exp6)
  7. exp6 <- as.data.frame(exp6)
  8. exprset.map <- exp6[label.deg,]#选取想要绘图的基因

绘图

  1. install.packages("pheatmap")
  2. library(pheatmap)
  3. pheatmap::pheatmap(exprset.map,#热图的数据
  4. cluster_rows = F,#行聚类(行间表达相似则聚类)
  5. cluster_cols=F,#列聚类,可以看出样本之间的区分度
  6. annotation_col = annotation_col1,
  7. show_colnames = F, #不展示样品名
  8. scale="row",#以行标准化
  9. color=colorRampPalette(c("blue","white","red"))(100))#100个渐变色

4.3 箱线图(某一个基因在对照组和疾病组中的差异)

  1. exp.x=exp6["TTC32",]
  2. z=t(exp.x)#行名为样本名,列为基因表达量
  3. z=as.data.frame(z)
  4. !z$type=c(rep("control",18),rep("treatment",18))
  5. library(ggpubr)
  6. library(ggsignif)
  7. library(ggplot2)
  8. ggboxplot(z,x="type",y="TTC32",
  9. width = 0.6,fill="type",
  10. notch = T,palette = c("#00AFBB", "red","#E7B800"),
  11. add = "jitter",shape="type")
  12. ###加个p值
  13. p=ggboxplot(z,x="type",y="TTC32",
  14. width = 0.6,fill="type",
  15. notch = T,palette = c("#00AFBB", "red","#E7B800"),
  16. add = "jitter",shape="type")
  17. p + stat_compare_means(aes(group =type))

若对照组和疾病组来自同一个人,可以在点与点之间连线

  1. !z$pairinfo=pairinfo=rep(1:18,2)1个和第19个样本来自同一个人
  2. !ggplot(z, aes(type,TTC32,fill=type)) +
  3. geom_boxplot() +
  4. geom_point(size=2, alpha=0.5) +
  5. geom_line(aes(group=pairinfo), colour="black", linetype="11") +
  6. xlab("") +
  7. ylab(paste("Expression of ","TTC32"))+
  8. theme_classic()+
  9. theme(legend.position = "none")

4.4 小提琴图

  1. library(ggsignif)
  2. compaired <- list(c("control", "treatment"))
  3. library(ggsci)
  4. ggplot(z,aes(type,TTC32,fill=type)) +
  5. geom_violin()+
  6. geom_signif(comparisons =compaired ,step_increase = 0.1,map_signif_level = c("***"=0.001, "**"=0.01, "*"=0.05),test =t.test,size=2,textsize = 6)+
  7. geom_jitter(shape=16, position=position_jitter(0.2))

六、 富集分析 (第九节)

5.1 

  1. #筛选差异基因(基础表达量高、变化明显、p值显著)
  2. select.FPKM <- data$AveExpr>5 #筛选表达量>5的基因
  3. select.log2FC <- abs(data$logFC)>1
  4. select.Pvalue <- data$P.Value<0.05
  5. select.vec <- select.FPKM & select.log2FC & select.Pvalue
  6. table(select.vec)
  7. degs.list <- as.character(rownames(data))[select.vec] #筛选出的基因名

5.1.1 GO

  1. ##BP、CC、MF改ont值即可
  2. library(org.Hs.eg.db)
  3. library(clusterProfiler)
  4. erich.go.BP = enrichGO(gene =degs.list,
  5. OrgDb = org.Hs.eg.db,
  6. keyType = "SYMBOL",
  7. ont = "BP",
  8. pvalueCutoff = 0.05,
  9. qvalueCutoff = 0.05)
  10. dotplot(erich.go.BP,showCategory = 8)#展示8个通路#横坐标为GeneRatio,代表差异基因占通路的百分比
  11. #或barplot(erich.go.BP,showCategory = 8)
  12. erich.go.BP=erich.go.BP@result#通路信息
  13. write.table(erich.go.BP,"erich.go.BP.deg.con.hf.txt",sep = "\t",col.names = NA)

 详细版

  1. setwd("")
  2. #install.packages("colorspace")
  3. #install.packages("stringi")
  4. #install.packages("ggplot2")
  5. #install.packages("circlize")
  6. #install.packages("RColorBrewer")
  7. #if (!requireNamespace("BiocManager", quietly = TRUE))
  8. # install.packages("BiocManager")
  9. #BiocManager::install("org.Hs.eg.db")
  10. #BiocManager::install("DOSE")
  11. #BiocManager::install("clusterProfiler")
  12. #BiocManager::install("enrichplot")
  13. #BiocManager::install("ComplexHeatmap")
  14. library(clusterProfiler)
  15. library(org.Hs.eg.db)
  16. library(enrichplot)
  17. library(ggplot2)
  18. library(circlize)
  19. library(RColorBrewer)
  20. library(dplyr)
  21. library(ComplexHeatmap)
  22. pvalueFilter=0.05 #p值过滤条件
  23. adjPvalFilter=0.05 #矫正后的p值过滤条件
  24. #定义颜色
  25. colorSel="p.adjust"
  26. if(adjPvalFilter>0.05){
  27. colorSel="pvalue"
  28. }
  29. rt=read.table("interGenes.Type.txt", header=T, sep="\t", check.names=F) #??ȡ?????ļ?
  30. #提取差异基因的名称,并将其转换为entrezID
  31. genes=unique(as.vector(rt[,1]))
  32. entrezIDs=mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
  33. entrezIDs=as.character(entrezIDs)
  34. rt=cbind(rt, entrezIDs)
  35. rt=rt[rt[,"entrezIDs"]!="NA",] #去除entrezID为NA的基因
  36. gene=rt$entrezID
  37. #gene=gsub("c\\(\"(\\d+)\".*", "\\1", gene)
  38. geneFC=rt$logFC
  39. names(geneFC)=gene
  40. #GO富集分析
  41. kk=enrichGO(gene=gene, OrgDb=org.Hs.eg.db, pvalueCutoff=1, qvalueCutoff=1, ont="all", readable=T)#ont="all"表示同时进行BP,CC,MF
  42. GO=as.data.frame(kk)
  43. GO=GO[(GO$pvalue<pvalueFilter & GO$p.adjust<adjPvalFilter),]
  44. write.table(GO, file="GO.txt", sep="\t", quote=F, row.names = F)
  45. #柱状图
  46. pdf(file="barplot.pdf", width=8.5, height=7)
  47. bar=barplot(kk, drop=TRUE, showCategory=10, label_format=130, split="ONTOLOGY", color=colorSel) + facet_grid(ONTOLOGY~., scale='free')
  48. print(bar)
  49. dev.off()
  50. #气泡图
  51. pdf(file="bubble.pdf", width=8.5, height=7)
  52. bub=dotplot(kk, showCategory=10, orderBy="GeneRatio", label_format=130, split="ONTOLOGY", color=colorSel) + facet_grid(ONTOLOGY~., scale='free')
  53. print(bub)
  54. dev.off()
  55. #基因和功能的关系图
  56. pdf(file="cnetplot.pdf", width=8, height=5.25)
  57. cnet=cnetplot(kk, circular=TRUE, foldChange=geneFC, showCategory=5, colorEdge=TRUE)
  58. print(cnet)
  59. dev.off()
  60. ###########GO圈图#############################
  61. ontology.col=c("#00AFBB", "#E7B800", "#90EE90")
  62. data=GO[order(GO$pvalue),]
  63. datasig=data[data$pvalue<0.05,,drop=F]
  64. BP = datasig[datasig$ONTOLOGY=="BP",,drop=F]
  65. CC = datasig[datasig$ONTOLOGY=="CC",,drop=F]
  66. MF = datasig[datasig$ONTOLOGY=="MF",,drop=F]
  67. BP = head(BP,6)
  68. CC = head(CC,6)
  69. MF = head(MF,6)
  70. data = rbind(BP,CC,MF)
  71. main.col = ontology.col[as.numeric(as.factor(data$ONTOLOGY))]
  72. #整理圈图数据
  73. BgGene = as.numeric(sapply(strsplit(data$BgRatio,"/"),'[',1))
  74. Gene = as.numeric(sapply(strsplit(data$GeneRatio,'/'),'[',1))
  75. ratio = Gene/BgGene
  76. logpvalue = -log(data$pvalue,10)
  77. logpvalue.col = brewer.pal(n = 8, name = "Reds")
  78. f = colorRamp2(breaks = c(0,2,4,6,8,10,15,20), colors = logpvalue.col)
  79. BgGene.col = f(logpvalue)
  80. df = data.frame(GO=data$ID,start=1,end=max(BgGene))
  81. rownames(df) = df$GO
  82. bed2 = data.frame(GO=data$ID,start=1,end=BgGene,BgGene=BgGene,BgGene.col=BgGene.col)
  83. bed3 = data.frame(GO=data$ID,start=1,end=Gene,BgGene=Gene)
  84. bed4 = data.frame(GO=data$ID,start=1,end=max(BgGene),ratio=ratio,col=main.col)
  85. bed4$ratio = bed4$ratio/max(bed4$ratio)*9.5
  86. #绘制圈图主体部分
  87. pdf("GO.circlize.pdf",width=10,height=10)
  88. par(omi=c(0.1,0.1,0.1,1.5))
  89. circos.par(track.margin=c(0.01,0.01))
  90. circos.genomicInitialize(df,plotType="none")
  91. circos.trackPlotRegion(ylim = c(0, 1), panel.fun = function(x, y) {
  92. sector.index = get.cell.meta.data("sector.index")
  93. xlim = get.cell.meta.data("xlim")
  94. ylim = get.cell.meta.data("ylim")
  95. circos.text(mean(xlim), mean(ylim), sector.index, cex = 0.8, facing = "bending.inside", niceFacing = TRUE)
  96. }, track.height = 0.08, bg.border = NA,bg.col = main.col)
  97. for(si in get.all.sector.index()) {
  98. circos.axis(h = "top", labels.cex = 0.6, sector.index = si,track.index = 1,
  99. major.at=seq(0,max(BgGene),by=100),labels.facing = "clockwise")
  100. }
  101. f = colorRamp2(breaks = c(-1, 0, 1), colors = c("green", "black", "red"))
  102. circos.genomicTrack(bed2, ylim = c(0, 1),track.height = 0.1,bg.border="white",
  103. panel.fun = function(region, value, ...) {
  104. i = getI(...)
  105. circos.genomicRect(region, value, ytop = 0, ybottom = 1, col = value[,2],
  106. border = NA, ...)
  107. circos.genomicText(region, value, y = 0.4, labels = value[,1], adj=0,cex=0.8,...)
  108. })
  109. circos.genomicTrack(bed3, ylim = c(0, 1),track.height = 0.1,bg.border="white",
  110. panel.fun = function(region, value, ...) {
  111. i = getI(...)
  112. circos.genomicRect(region, value, ytop = 0, ybottom = 1, col = '#BA55D3',
  113. border = NA, ...)
  114. circos.genomicText(region, value, y = 0.4, labels = value[,1], cex=0.9,adj=0,...)
  115. })
  116. circos.genomicTrack(bed4, ylim = c(0, 10),track.height = 0.35,bg.border="white",bg.col="grey90",
  117. panel.fun = function(region, value, ...) {
  118. cell.xlim = get.cell.meta.data("cell.xlim")
  119. cell.ylim = get.cell.meta.data("cell.ylim")
  120. for(j in 1:9) {
  121. y = cell.ylim[1] + (cell.ylim[2]-cell.ylim[1])/10*j
  122. circos.lines(cell.xlim, c(y, y), col = "#FFFFFF", lwd = 0.3)
  123. }
  124. circos.genomicRect(region, value, ytop = 0, ybottom = value[,1], col = value[,2],
  125. border = NA, ...)
  126. #circos.genomicText(region, value, y = 0.3, labels = value[,1], ...)
  127. })
  128. circos.clear()
  129. #绘制圈图中间的图例
  130. middle.legend = Legend(
  131. labels = c('Number of Genes','Number of Select','Rich Factor(0-1)'),
  132. type="points",pch=c(15,15,17),legend_gp = gpar(col=c('pink','#BA55D3',ontology.col[1])),
  133. title="",nrow=3,size= unit(3, "mm")
  134. )
  135. circle_size = unit(1, "snpc")
  136. draw(middle.legend,x=circle_size*0.42)
  137. #绘制GO分类的图例
  138. main.legend = Legend(
  139. labels = c("Biological Process", "Cellular Component", "Molecular Function"), type="points",pch=15,
  140. legend_gp = gpar(col=ontology.col), title_position = "topcenter",
  141. title = "ONTOLOGY", nrow = 3,size = unit(3, "mm"),grid_height = unit(5, "mm"),
  142. grid_width = unit(5, "mm")
  143. )
  144. #绘制富集显著性pvalue的图例
  145. logp.legend = Legend(
  146. labels=c('(0,2]','(2,4]','(4,6]','(6,8]','(8,10]','(10,15]','(15,20]','>=20'),
  147. type="points",pch=16,legend_gp=gpar(col=logpvalue.col),title="-log10(Pvalue)",
  148. title_position = "topcenter",grid_height = unit(5, "mm"),grid_width = unit(5, "mm"),
  149. size = unit(3, "mm")
  150. )
  151. lgd = packLegend(main.legend,logp.legend)
  152. circle_size = unit(1, "snpc")
  153. print(circle_size)
  154. draw(lgd, x = circle_size*0.85, y=circle_size*0.55,just = "left")
  155. dev.off()

5.1.2 KEGG(需要将基因名称转换为entrez_id)

  1. DEG.entrez_id = mapIds(x = org.Hs.eg.db,
  2. keys = degs.list,
  3. keytype = "SYMBOL",
  4. column = "ENTREZID")
  5. erich.kegg.res <- enrichKEGG(gene = DEG.entrez_id,
  6. organism = "hsa",#组织名称,hsa表示人
  7. keyType = "kegg")
  8. dotplot(erich.kegg.res)
  9. zzh=erich.kegg.res@result
  10. write.table(zzh,"erich.kegg.res.6.19.txt",sep = "\t",col.names = NA)

详细版

  1. setwd("")
  2. #install.packages("colorspace")
  3. #install.packages("stringi")
  4. #install.packages("ggplot2")
  5. #if (!requireNamespace("BiocManager", quietly = TRUE))
  6. # install.packages("BiocManager")
  7. #BiocManager::install("org.Hs.eg.db")
  8. #BiocManager::install("DOSE")
  9. #BiocManager::install("clusterProfiler")
  10. #BiocManager::install("enrichplot")
  11. library("clusterProfiler")
  12. library("org.Hs.eg.db")
  13. library("enrichplot")
  14. library("ggplot2")
  15. pvalueFilter=0.05
  16. adjPvalFilter=0.05
  17. #定义图形的颜色
  18. colorSel="p.adjust"
  19. if(adjPvalFilter>0.05){
  20. colorSel="pvalue"
  21. }
  22. rt=read.table("interGenes.Type.txt", header=T, sep="\t", check.names=F) #??ȡ?????ļ?
  23. #提取差异基因名称,将基因名转换为entrezID
  24. genes=unique(as.vector(rt[,1]))
  25. entrezIDs=mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
  26. entrezIDs=as.character(entrezIDs)
  27. rt=cbind(rt, entrezIDs)
  28. rt=rt[rt[,"entrezIDs"]!="NA",]
  29. gene=rt$entrezID
  30. #gene=gsub("c\\(\"(\\d+)\".*", "\\1", gene)
  31. geneFC=rt$logFC
  32. names(geneFC)=gene
  33. #kegg富集分析
  34. kk <- enrichKEGG(gene=gene, organism="hsa", pvalueCutoff=1, qvalueCutoff=1)
  35. KEGG=as.data.frame(kk)
  36. KEGG$geneID=as.character(sapply(KEGG$geneID,function(x)paste(rt$genes[match(strsplit(x,"/")[[1]],as.character(rt$entrezID))],collapse="/")))
  37. KEGG=KEGG[(KEGG$pvalue<pvalueFilter & KEGG$p.adjust<adjPvalFilter),]
  38. #输出显著富集的结果
  39. write.table(KEGG, file="KEGG.txt", sep="\t", quote=F, row.names = F)
  40. #定义显示通路的数目
  41. showNum=30
  42. if(nrow(KEGG)<showNum){
  43. showNum=nrow(KEGG)
  44. }
  45. #柱状图
  46. pdf(file="barplot.pdf", width=8, height=7)
  47. barplot(kk, drop=TRUE, showCategory=showNum, label_format=100, color=colorSel)
  48. dev.off()
  49. #气泡图
  50. pdf(file="bubble.pdf", width=8, height=7)
  51. dotplot(kk, showCategory=showNum, orderBy="GeneRatio", label_format=100, color=colorSel)
  52. dev.off()
  53. #基因和通路的关系图
  54. pdf(file="cnetplot.pdf", width=9, height=5.25)
  55. kkx=setReadable(kk, 'org.Hs.eg.db', 'ENTREZID')
  56. cnet=cnetplot(kkx, circular=TRUE, foldChange=geneFC, showCategory=5, colorEdge=TRUE)
  57. print(cnet)
  58. dev.off()

5.1.3  挑选感兴趣的通路进行绘图(以erich.kegg.res.6.19.txt为例)

  • 用excel打开文档,复制粘贴感兴趣的通路包含列名到新文档1.27.txt,打开新的Rstudio
  1. options(stringsAsFactors = FALSE)
  2. rm(list=ls())
  3. setwd("D:\\immune cell infiltration\\GEO")
  4. kegg=read.table("1.27.txt",header = T,sep = "\t")
  5. k = data.frame(kegg)
  6. library(ggplot2)
  7. library(dplyr)
  8. #将GeneRatio转换为小数点样式
  9. x=k$GeneRatio
  10. x1=as.character(x)
  11. a=strsplit(x1,split="/",fixed=T)
  12. be=sapply(a,function(x){x[1]})
  13. be1=as.numeric(be)
  14. after=sapply(a,function(x){x[2]})
  15. after=as.numeric(after)
  16. ?strsplit
  17. k$GeneRatio = be1/after
  18. rm(x,x1,a,be,be1,after)
  19. #或者
  20. before <- as.numeric(sub("/\\d+$",replacement = "", k$GeneRatio))
  21. after <- as.numeric(sub("^\\d+/", "", k$GeneRatio))
  22. font.size =10
  23. k %>%
  24. ## 对进行p值排序
  25. arrange(p.adjust) %>%
  26. ##指定富集的通路数目(10个)
  27. slice(1:10) %>%
  28. ## 开始ggplot2 作图,其中fct_reorder调整因子level的顺序
  29. ggplot(aes(GeneRatio,forcats::fct_reorder(Description,Count)))+
  30. ## 画出点图,size=Count代表气泡越大,count数越高
  31. geom_point(aes(color=p.adjust, size = Count)) +
  32. ## 调整颜色,guide_colorbar调整色图的方向
  33. scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE))+
  34. ## 调整泡泡的大小
  35. scale_size_continuous(range=c(3, 8))+
  36. ## 如果用ylab("")或出现左侧空白
  37. labs(y=NULL) +
  38. ## 如果没有这一句,上方会到顶
  39. ggtitle("")+
  40. ## 设定主题
  41. theme_bw() +
  42. theme(axis.text.x = element_text(colour = "black",
  43. size = font.size, vjust =1 ),
  44. axis.text.y = element_text(colour = "black",
  45. size = font.size, hjust =1 ),
  46. axis.title = element_text(margin=margin(10, 5, 0, 0),
  47. color = "black",size = font.size),
  48. axis.title.y = element_text(angle=90))

5.2 GSEA

法一

  1. select.FPKM <- data$AveExpr>5 #筛选表达量>5的基因
  2. table(select.FPKM)
  3. gs.list <- as.character(rownames(data))[select.FPKM] #筛选出的基因名
  4. f1=data[gs.list,]
  5. f1$symbol <- rownames(f1)
  6. exp.fc <- f1[,c(8,1)]#只含有基因名和logFC值
  7. rm(f1)
  8. ###id转化
  9. expm.id <- bitr(exp.fc$symbol, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
  10. exp.fc.id <- merge(exp.fc, expm.id,by.x="symbol", by.y="SYMBOL", all=F)
  11. exp.fc.id=na.omit(exp.fc.id)
  12. exp.fc.id.sorted <- exp.fc.id[order(exp.fc.id$logFC, decreasing = T),]
  13. id.fc <- exp.fc.id.sorted$logFC
  14. names(id.fc) <- exp.fc.id.sorted$ENTREZID
  15. head(id.fc)
  16. rm(expm.id,exp.fc.id,exp.fc.id.sorted)
  17. gsea <- gseKEGG(id.fc, organism = "hsa",pvalueCutoff = 0.5)
  18. go_result <- gseGO(geneList = id.fc,
  19. ont = "BP",
  20. OrgDb = org.Hs.eg.db,
  21. pvalueCutoff = 0.5)
  22. ####查看一下富集结果(NES绝对值大于1.5、p.adjust<0.05富集效果较好)
  23. View(go_result@result)
  24. gseaplot2(gsea, geneSetID = 1,title = "abc")#绘制gsea中的第一条通路
  25. gseaplot2(gsea, geneSetID = c(2,3))#第二和第三条通路

法二

  1. select.FPKM <- data$AveExpr>5 #筛选表达量>5的基因
  2. table(select.FPKM)
  3. gs.list <- as.character(rownames(data))[select.FPKM] #筛选出的基因名
  4. f1=data[gs.list,]
  5. f1$symbol <- rownames(f1)
  6. f1 <- distinct(f1,symbol,.keep_all = T)
  7. exp.fc <- f1[,c(8,1)]#只含有基因名和logFC值
  8. rm(f1)
  9. deg = exp.fc$logFC
  10. names(deg) = exp.fc$symbol
  11. deg= sort(deg,decreasing = T)
  12. head(deg)
  13. install.packages("msigdbr")
  14. library(msigdbr)
  15. h.kegg= msigdbr(species ="Homo sapiens",category = "C2",subcategory = "KEGG")
  16. length(unique(h.kegg$gs_exact_source))
  17. h.kegg= h.kegg[,c(3,4)]
  18. gsea <- GSEA(deg,pvalueCutoff = 0.5, TERM2GENE =h.kegg)
  19. gseaplot2(gsea, geneSetID = 6, title = gsea$Description[6])
  20. gsea1=gsea@result
  21. write.table(gsea1,"gsea.res.txt",sep = "\t",col.names = NA)

七、核心基因(cytohubba12个打分类型的交集基因)

  • 利用cytoscape(插件为cytoHubba):12个打分类型,每个打分类型分别找出打分最高的基因,取交集所得基因即为核心基因,若找不到核心基因,则删掉几个打分类型
  • 准备输入文件:从string网站获得的网络关系文件
  • 分析完成后得到评分文件score.csv
  1. install.packages("UpSetR")
  2. library(UpSetR)
  3. setwd("D:\\immune cell infiltration\\multiarray")
  4. rt=read.csv("score.csv", header=T, sep=",", check.names=F, row.names=1)
  5. colnames(rt)=c(colnames(rt)[2:ncol(rt)], "N")
  6. scoreType=c("MCC", "DMNC", "MNC", "Degree", "EPC","BottleNeck", "EcCentricity","Closeness", "Radiality", "Betweenness", "Stress","ClusteringCoefficient")
  7. #对打分进行排序
  8. upsetList=c()
  9. for(i in scoreType){
  10. data=rt[order(rt[,i],decreasing=T),]
  11. hubGene=row.names(data)[1:50] #筛选打分最高的前50个基因
  12. upsetList[[i]]=hubGene
  13. }
  14. upsetData=fromList(upsetList)
  15. #输出图形
  16. pdf(file="upset.pdf", width=9, height=6, onefile=FALSE)
  17. upset(upsetData,
  18. nsets = length(scoreType), #展示打分类型个数
  19. nintersects = 50, #展示基因集数目
  20. order.by = "freq", #排序的方式(按照基因数目排序)
  21. show.numbers = "yes", #ֵ柱状图上方是否显示数值
  22. number.angles = 20, #字体角度
  23. point.size = 1.5, #点的大小
  24. matrix.color="red", #点的颜色
  25. line.size = 0.8, #线条粗细
  26. mainbar.y.label = "Gene Intersections",
  27. sets.x.label = "Set Size")
  28. dev.off()
  29. #获取交集基因,作为网络的核心基因
  30. intersectGenes=Reduce(intersect, upsetList)
  31. intersectGenes=as.data.frame(intersectGenes)
  32. colnames(intersectGenes)="Gene"
  33. write.csv(file="hubGenes.csv", intersectGenes, quote=F, row.names=F)

核心基因可视化

热图

  1. #if (!requireNamespace("BiocManager", quietly = TRUE))
  2. # install.packages("BiocManager")
  3. #BiocManager::install("limma")
  4. #install.packages("pheatmap")
  5. library(limma)
  6. library(pheatmap)
  7. setwd("")
  8. exp=read.table("exp6.csv", header=T, sep=",", check.names=F,row.names = 1)
  9. dimnames=list(rownames(exp),colnames(exp))
  10. data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
  11. data=avereps(data)
  12. geneRT=read.csv("hubGenes.csv", header=T, sep=",", check.names=F)
  13. data=data[as.vector(geneRT[,1]),]#核心基因的表达矩阵
  14. annotation_col1 <- data.frame(
  15. database <- c(rep("GEO124226",8)),
  16. Type <- c(rep("control",4),rep("treatment",4))
  17. )
  18. rownames(annotation_col1) <- colnames(data)
  19. colnames(annotation_col1) <- c("database","Type")
  20. #绘制核心基因热图
  21. pdf(file="hubGene.heatmap.pdf", width=7.5, height=3.5)
  22. pheatmap(data,
  23. annotation_col =annotation_col1,
  24. color = colorRampPalette(c(rep("blue",2), "white", rep("red",2)))(50),
  25. cluster_cols =F,
  26. show_colnames = F,
  27. scale="row",
  28. fontsize = 8,
  29. fontsize_row=7,
  30. fontsize_col=8)
  31. dev.off()

7.1 核心基因绘制ROC曲线

  1. #install.packages("glmnet")
  2. #install.packages("pROC")
  3. library(glmnet)
  4. library(pROC)
  5. expFile="limma_expr_control+treat.csv"
  6. geneFile="cytohubba_0.9_10.csv" #核心基因
  7. setwd("D:\\limma_expr\\WGCNA1")
  8. rt=read.table(expFile, header=T, sep=",", check.names=F, row.names=1)
  9. geneRT=read.table(geneFile, header=T, sep=",", check.names=F)
  10. geneRT <- data.frame(geneRT$Name)
  11. #获取表达矩阵样品信息
  12. y <- c(rep(0,15),rep(1,23))
  13. #对核心基因进行循环,绘制ROC曲线
  14. bioCol=rainbow(nrow(geneRT), s=0.9, v=0.9) #定义图形颜色
  15. aucText=c()
  16. k=0
  17. for(x in as.vector(geneRT[,1])){
  18. k=k+1
  19. 绘制ROC曲线
  20. roc1=roc(y, as.numeric(rt[x,])) #得到ROC曲线的参数
  21. if(k==1){
  22. pdf(file="ROC.genes.pdf", width=5, height=4.75)
  23. plot(roc1, print.auc=F, col=bioCol[k], legacy.axes=T, main="")
  24. aucText=c(aucText, paste0(x,", AUC=",sprintf("%.3f",roc1$auc[1])))
  25. }else{
  26. plot(roc1, print.auc=F, col=bioCol[k], legacy.axes=T, main="", add=TRUE)
  27. aucText=c(aucText, paste0(x,", AUC=",sprintf("%.3f",roc1$auc[1])))
  28. }
  29. }
  30. #绘制图例,得到ROC曲线下面积
  31. legend("bottomright", aucText, lwd=2, bty="n", col=bioCol[1:(ncol(rt)-1)])
  32. dev.off()
  33. #构建逻辑模型
  34. rt=rt[as.vector(geneRT[,1]),] #提取核心基因表达量
  35. rt=as.data.frame(t(rt))
  36. logit=glm(y ~ ., family=binomial(link='logit'), data=rt)
  37. pred=predict(logit, newx=rt) #得到模型的打分
  38. #绘制模型的ROC曲线
  39. roc1=roc(y, as.numeric(pred)) #得到模型ROC曲线的参数
  40. ci1=ci.auc(roc1, method="bootstrap") #ROC曲线下面积的波动范围
  41. ciVec=as.numeric(ci1)
  42. pdf(file="ROC.model.pdf", width=5, height=4.75)
  43. plot(roc1, print.auc=TRUE, col="red", legacy.axes=T, main="Model")
  44. text(0.39, 0.43, paste0("95% CI: ",sprintf("%.03f",ciVec[1]),"-",sprintf("%.03f",ciVec[3])), col="red")
  45. dev.off()

八、免疫细胞浸润分析与核心基因联合分析

准备矫正后的合并文件combat_expr.csv或limma_expr.csv
cytoscape评分结果文件hubGenes.csv,免疫细胞结果文件cibersort_result.txt和Diff.cell.txt

  1. #if (!requireNamespace("BiocManager", quietly = TRUE))
  2. # install.packages("BiocManager")
  3. #BiocManager::install("limma")
  4. #install.packages("tidyverse")
  5. #install.packages("ggplot2")
  6. #install.packages("reshape2")
  7. #install.packages("ggpubr")
  8. install.packages("ggExtra")
  9. #install.packages("corrplot")
  10. library(limma)
  11. library(reshape2)
  12. library(tidyverse)
  13. library(ggplot2)
  14. library(ggpubr)
  15. library(ggExtra)
  16. library(corrplot)
  17. corFilter=0.4 #相关系数过滤文件
  18. pvalueFilter=0.001 #相关性检验pvalue过滤文件
  19. exp=read.table("limma_expr.csv", header=T, sep=",", check.names=F,row.names = 1)
  20. exp=as.matrix(exp)
  21. dimnames=list(rownames(exp),colnames(exp))
  22. data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
  23. data=avereps(data)
  24. #提取核心基因的表达量
  25. geneRT=read.csv("hubGenes.csv", header=T, sep=",", check.names=F)
  26. data=data[as.vector(geneRT[,1]),]#行名基因名,列名样本名
  27. #去除对照组样品
  28. group <- data.frame(sample=character(38),group=character(38))
  29. group$sample <- colnames(exp)
  30. tissue <- c(rep(c("control","treat","control","treat","control","treat"),c(3,7,4,8,8,8)))
  31. group$group <- tissue
  32. data <- cbind(t(data),group)
  33. data=data[data$group=="treat",]#行名为样本名,列名为基因名
  34. data <- data[,-c(13,14)]
  35. data <- as.matrix(data)
  36. #读取核心免疫细胞的含量
  37. immune=read.table("cibersort_result.txt", header=T, sep="\t", check.names=F, row.names=1)
  38. cellRT=read.csv("Diff.cell.txt", header=F, sep="\t", check.names=F)
  39. immune=immune[,as.vector(cellRT[,1])]#行名样本名,列名为免疫细胞名
  40. #提取基因表达矩阵和免疫浸润结果文件中都包含的样本信息
  41. sameSample=intersect(row.names(data), row.names(immune))
  42. data=data[sameSample,,drop=F]
  43. immune=immune[sameSample,,drop=F]#都只含实验组样品
  44. #相关性分析
  45. outTab=data.frame()
  46. #对免疫细胞进行循环
  47. for(cell in colnames(immune)){
  48. if(sd(immune[,cell])==0){next}
  49. #对核心基因进行循环
  50. for(gene in colnames(data)){
  51. x=as.numeric(data[,gene])
  52. y=as.numeric(immune[,cell])
  53. corT=cor.test(x,y,method="spearman")
  54. cor=corT$estimate
  55. pvalue=corT$p.value
  56. #对满足条件的免疫细胞进行可视化,绘制相关性图形
  57. if((abs(cor)>corFilter) & (pvalue<pvalueFilter)){
  58. df1=as.data.frame(cbind(x,y))
  59. p1=ggplot(df1, aes(x, y)) +
  60. xlab(paste0(gene, " expression")) + ylab(cell)+
  61. geom_point() + geom_smooth(method="lm",formula = y ~ x) + theme_bw()+
  62. stat_cor(method = 'spearman', aes(x =x, y =y))
  63. #相关性图形
  64. pdf(file=paste0("cor.", gene, "_", cell, ".pdf"), width=5, height=4.6)
  65. print(p1)
  66. dev.off()
  67. }
  68. }
  69. }
  70. #相关性热图ͼ
  71. data=cbind(data, immune)#行名为实验组样品名,列名为基因名和免疫细胞名
  72. write.csv(data,"data")
  73. a <- read.csv("data",header=T,row.names = 1)
  74. pdf("corHeatmap.pdf", width=9, height=9)
  75. corrplot(corr=cor(a),
  76. method = "color", #图形的展示形式
  77. order = "original", #免疫细胞的排序方式
  78. tl.col="black", #字体颜色
  79. addCoef.col = "black", #相关系数字体颜色
  80. number.cex = 0.8, #相关系数字体大小
  81. col=colorRampPalette(c("blue", "white", "red"))(50), #ͼ????ɫ??????
  82. )
  83. dev.off()

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/天景科技苑/article/detail/860003
推荐阅读
相关标签
  

闽ICP备14008679号