赞
踩
R:https://www.r-project.org/
Rstudio: https://posit.co/products/open-source/rstudio/
下载探针矩阵probe.txt和平台注释ann.txt
部分平台注释文件不能下载,解决办法:
- !setwd("D:\\immune cell infiltration\\GEO")
- getwd()
- 2.在GEO数据库中下载GSE信息
- #install.packages("tidyverse")
- library(GEOquery)
- !gset <- getGEO(GEO = "GSE124226",destdir = ".",getGPL = F)#destdir="."表示下载到当前路径
- e2 <- gset[[1]]
- phe <- e2@phenoData@data #提取临床信息
- exp <- e2@assayData[["exprs"]] #提取表达矩阵
-
- 3.在GEO数据库中下载对应平台信息:(平台信息含探针对应的基因名) 该处ID_symbol处需替换
- library(data.table)
- ann=fread("ann.txt",sep="\t",header=T,data.table = F)#能在excel中打开的文件都可以用fread函数读取
- ID_symbol <- ann[,c("ID","Gene Symbol")]
-
- 4.部分探针对应的基因名有两个(如"DDR1 /// MIR4640" ),将排在后面的去掉 该处ID_symbol、ID_gene_symbol处需替换
- symbol <- ID_symbol$`Gene Symbol`
- symbol1 <- strsplit(symbol,split=" /// ",fixed=T)#fixed=T表示精确查找
- gene_symbol <- sapply(symbol1,function(x){x[1]})#提取每一个子列表中的第一个元素
- ID_gene_symbol <- data.frame(ID_symbol$ID,gene_symbol)
- rm(ID_symbol,symbol,symbol1,gene_symbol)
-
- 5.将表达矩阵与基因名合并
- library(dplyr)
- exp <- as.data.frame(exp)#merge函数只对两个data.frame进行合并
- exp1 <- merge(ID_gene_symbol,exp,by.x = 1,by.y = 0)
- exp2 <- distinct(exp1,gene_symbol,.keep_all=T)
- exp3 <- na.omit(exp2)#将缺失的基因名去除
- rownames(exp3) <- exp3$gene_symbol
- exp4 <- exp3[,-c(1,2)]
-
- 6.将对照组样本放在前面并进行批次校正
- !control <- c()#对照样本所在列
- !treat <- c(seq(1,35,2))##查看tumor样本所在列
- exp5 <- exp4[,c(control,treat)]#将对照样本放在前面
- !group_list <- c(rep("control",18),rep("treat",18))
- group_list <- factor(group_list)
- group_list <- relevel(group_list,ref="control")#强制限定顺序,control在前
-
- library(limma)
- exp6 <- normalizeBetweenArrays(exp5)#除去批次效应
- boxplot(exp5,outline=F,notch=T,col=group_list,las=2)
- boxplot(exp6,outline=F,notch=T,col=group_list,las=2)#绘制出的图形相同,说明样本已经进行过归一化处理
- rm(ID_gene_symbol,exp,exp1,exp2,exp3,exp4,exp5,control,treat)
- 至此,我们获得了想要的横坐标为基因名,纵坐标为样本名的表达矩阵exp6

- ex <- exp6
- qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
- LogC <-(qx[5]>100)||
- (qx[6]-qx[1] > 50 && qx[2] > 0) ||
- (qx[2]>0&&qx[2]<1&&qx[4]>1&&qx[4]<2)
-
- if(LogC){
- ex[which(ex<=0)]<-NaN
- exprSet <- log2(ex)
- print("需要取log2")}else{print("无需取log2")}
- rm(ex,exp6,LogC,qx)
- exp6 <- exprSet
- write.csv(exp6,"GSE34526.NORMALIZE.CSV")
- setwd("D:\\hebing")
- getwd()
- BiocManager::install("sva")
- # 安装包
- install.packages("FactoMineR")
- install.packages("factoextra")
-
- library(sva)
- library(FactoMineR)
- library(factoextra)
-
- GSE1=read.csv(file = "GSE34526.NORMALIZE.csv",header = T,row.names = 1)
- GSE2=read.csv(file = "GSE137684.NORMALIZE.csv",header = T,row.names = 1)
- GSE3=read.csv(file = "GSE80432.NORMALIZE.csv",header = T,row.names = 1)
-
-
- #GEO数据数据合并
- name1 <- intersect(rownames(GSE1),rownames(GSE2))
- expr <- cbind(GSE1[name1,],GSE2[name1,])
- #若是多个数据(>2),则执行如下命令
- name1 <- intersect(rownames(GSE1),rownames(GSE2))
- name2 <- intersect(name1,rownames(GSE3))
- expr <- cbind(GSE1[name2,],GSE2[name2,],GSE3[name2,])
-
- batch <- c(rep("GSE5090",17),rep("GSE43264",15),rep("GSE6798",29),rep("GSE34526",10),rep("GSE10946",23))
- tissue <- c(rep(c("control","treat","control","treat","control","treat","control","treat","control","treat"),c(8,9,7,8,13,16,3,7,11,12)))
- mode <- model.matrix(~tissue)
-
- ########combat去除批次效应
- combat_expr <- ComBat(dat = expr,
- batch = batch,
- mod = mode)
- #combat_expr就是校正后且合并后的GEO数据
-
- ####removeBatchEffect去除批次效应
- limma_expr <- removeBatchEffect(expr,batch = batch,design = mode)
-
- rm(expr,GSE1,GSE2,GSE3,name1,name2)
-
- #矫正前PCA分析
- pre.pca <- PCA(t(expr),graph = FALSE)#expr1与前文对应
- fviz_pca_ind(pre.pca,
- geom= "point",
- col.ind = batch,#batch与前文对应
- addEllipses = TRUE,#添加椭圆
- legend.title="Group" )#图例名为Group
- #此处保存一下图片,方便后续对比
-
- #矫正后PCA分析(1和2都运行,选择百分数值较小的一个进行后续分析)
- #1
- combat.pca <- PCA(t(combat_expr),graph = FALSE)
- fviz_pca_ind(combat.pca,
- geom= "point",
- col.ind = batch,
- addEllipses = TRUE,
- legend.title="Group" )
-
- #2或
- combat.pca <- PCA(t(limma_expr),graph = FALSE)
- fviz_pca_ind(combat.pca,
- geom= "point",
- col.ind = batch,
- addEllipses = TRUE,
- legend.title="Group" )
- write.csv(limma_expr,"limma_expr.csv")
- write.csv(combat_expr,"combat_expr.csv")
-
- #箱线图查看前后对比
- par(mfrow=c(1,2))
- boxplot(as.data.frame(expr),main="Original")
- boxplot(as.data.frame(combat_expr),main="Batch corrected")
-
- 选取较好的矫正方法进行后续分析

最终得到combat_expr.csv或limma_expr.csv,文件为合并后的GEO数据(行名样本名,列名基因名),样本名顺序按GSE1,GSE2,GSE3排列,每个GSE中对照组样本在前,疾病组样本在后
- tissue <- c(rep(c("control","treat","control","treat","control","treat"),c(3,7,4,8,8,8)))
- setwd("C:\\Users\\LENOVO\\Desktop\\wgcna")
- limma_expr <- read.table("limma_expr.csv",sep=",",header=T,row.names = 1)
- data <- cbind(t(limma_expr),tissue)
- conData <- subset(data,tissue=="control")
- treatData <- subset(data,tissue=="treat")
- conNum <- nrow(conData)
- treatNum <- nrow(treatData)
- data=rbind(conData,treatData)
- data <- t(data[,-15068])
- write.csv(data,"limma_expr_control+treat.csv")
- rm(conData,treatData,limma_expr)
-
- fenzu=data.frame(Normal=c(rep(1,conNum),rep(0,treatNum)),
- Tumor=c(rep(0,conNum),rep(1,treatNum)))
- row.names(fenzu)=colnames(data)
- write.csv(fenzu,"fenzu.csv")#行名样品名,列名Normal和Tumor,是用1表示,否用0表示,分组也可用作临床信息

最终得到data(limma_expr_control+teart.csv),fenzu.csv
- library(limma)
- exprSet <- read.csv("limma_expr_control+treat.csv",row.names = 1)
- #构建分组矩阵
- fenzu <- read.csv("fenzu.csv",row.names = 1)
- table(fenzu)
- group <- c(rep("con",15),rep("treat",23))
- group <- factor(group,levels = c("con","treat"))
- design <- model.matrix(~0+group)
- colnames(design) <- levels(group)
- design
-
- ### 2.构建比较矩阵
- contrast.matrix <- makeContrasts(treat - con,levels=design)
-
-
- #线性拟合模型构建
-
- fit <- lmFit(exprSet,design)#线性模型拟合
- fit <- contrasts.fit(fit, contrast.matrix)
- fit <- eBayes(fit)#贝叶斯检验
-
- ### 4.最终得到差异分析结果
- allDiff=topTable(fit,number=Inf)
-
-
- #输出所有基因的差异情况
- write.table(allDiff,"allDiff.txt",sep="\t",col.names = NA)
- #write.csv(allDiff,"allDiff.csv")
-
- #基因上调和下调设定
- data <- allDiff
- data$significant <- "stable"
- data$significant[data$logFC>=1 & data$P.Value<0.05]="up"
- data$significant[data$logFC<=-1 & data$P.Value<0.05]="down"
-
-
- #输出差异结果
- diffSig=data[with(data,(abs(logFC)>1 & P.Value<0.05)),]
- write.table(diffSig,"diff_1_0.05.txt",sep="\t",col.names = NA,quote=F)
-
-
- #筛选差异基因(基础表达量高、变化明显、p值显著)
- select.log2FC <- abs(data$logFC)>1
- select.Pvalue <- data$P.Value<0.05
- select.vec <- select.log2FC & select.Pvalue
- table(select.vec)
-
- degs.list <- as.character(rownames(data))[select.vec] #筛选出的基因名
- write.table(degs.list,"DEG.txt",sep="\t",quote=F,row.names=F,col.names = F)

图形绘制
- library(ggplot2)
- library(ggrepel)
-
- #ggplot中的每一个+号代表加一个参数的内容
- pdf("volcano.pdf",width = 6,height = 5)
- ggplot(data,aes(logFC,-1*log10(P.Value)))+xlim(-2,2)+ylim(0,5)+ #x轴为logFC,p值越小,y值越大
- geom_point(aes(color=significant),size=0.8)+theme_classic()+ #画散点图,size代表点的大小,颜色分类按照significant
- scale_color_manual(values = c("green","black","red"))+ #点的颜色
- geom_hline(yintercept = -log10(0.05),linetype=4,size=0.3)+ #垂直于y轴的线
- geom_vline(xintercept = c(-1,1),linetype=4,size=0.3)+ #垂直于x轴的线
- theme(title = element_text(size=18),text = element_text(size=18))+
- labs(x="log2(foldchange)",y="-log10(P_Value)")
- dev.off()
在火山图中标注基因名称
- setwd("D:\\limma_expr\\limma差异分析")
- BiocManager::install("ggrepel")
- library(ggrepel)
- data <- read.table("allDiff.txt",sep="\t",header=T,row.names = 1)
- data$significant <- "stable"
- data$significant[data$logFC>=1 & data$P.Value<0.05]="up"
- data$significant[data$logFC<=-1 & data$P.Value<0.05]="down"
- label.deg <- c("SYK","FCGR3A","TLR4",
- "TYROBP","FCER1G","CD4","ITGAM","PLCG2","ITGB2","VAV1")
- p=ggplot(data,aes(logFC,-1*log10(P.Value)))+xlim(-1.5,2)+ylim(0,5)+
- geom_point(aes(color=significant),size=0.8)+theme_classic()+
- scale_color_manual(values = c("green","black","red"))+
- geom_hline(yintercept = -log10(0.05),linetype=4,size=0.3)+
- geom_vline(xintercept = c(-1,1),linetype=4,size=0.3)+
- theme(title = element_text(size=18),text = element_text(size=18))+
- labs(x="log2(foldchange)",y="-log10(P_Value)")
- data_selected <- data[label.deg,] #筛选出的基因所在行
-
- p + # 基于普通火山图p
- geom_point(data = data_selected, # 添加强调显示的散点,仅限于上调基因
- aes(x = logFC, y = -log10(P.Value)),
- color = 'red3', size = 4.5, alpha = 0.2) + # 设置散点的颜色、大小和透明度
- geom_label_repel(data = data_selected, # 添加带边框的标签,仅限于上调基因
- aes(x = logFC, y = -log10(P.Value), label = rownames(data_selected)),
- seed = 233, # 设置随机数种子,用于确定标签位置
- size = 3.5, # 设置标签的字体大小
- color = 'black', # 设置标签的字体颜色
- min.segment.length = 0, # 始终为标签添加指引线段
- force = 2, # 设置标签重叠时的排斥力
- force_pull = 2, # 设置标签与数据点之间的吸引力
- box.padding = 0.4, # 设置标签周围的填充量
- max.overlaps = Inf) # 设置排斥重叠过多标签的阈值为无穷大,保持始终显示所有标签
-
-

分组、选基因
- !annotation_col1 <- data.frame(
- database <- c(rep("GEO",36)),
- CellType <- c(rep("control",18),rep("treatment",18))
- )
- rownames(annotation_col1) <- colnames(exp6)
- class(exp6)
- exp6 <- as.data.frame(exp6)
- exprset.map <- exp6[label.deg,]#选取想要绘图的基因
绘图
- install.packages("pheatmap")
- library(pheatmap)
- pheatmap::pheatmap(exprset.map,#热图的数据
- cluster_rows = F,#行聚类(行间表达相似则聚类)
- cluster_cols=F,#列聚类,可以看出样本之间的区分度
- annotation_col = annotation_col1,
- show_colnames = F, #不展示样品名
- scale="row",#以行标准化
- color=colorRampPalette(c("blue","white","red"))(100))#100个渐变色
- exp.x=exp6["TTC32",]
-
- z=t(exp.x)#行名为样本名,列为基因表达量
- z=as.data.frame(z)
- !z$type=c(rep("control",18),rep("treatment",18))
-
- library(ggpubr)
- library(ggsignif)
- library(ggplot2)
-
- ggboxplot(z,x="type",y="TTC32",
- width = 0.6,fill="type",
- notch = T,palette = c("#00AFBB", "red","#E7B800"),
- add = "jitter",shape="type")
-
- ###加个p值
- p=ggboxplot(z,x="type",y="TTC32",
- width = 0.6,fill="type",
- notch = T,palette = c("#00AFBB", "red","#E7B800"),
- add = "jitter",shape="type")
-
- p + stat_compare_means(aes(group =type))

若对照组和疾病组来自同一个人,可以在点与点之间连线
- !z$pairinfo=pairinfo=rep(1:18,2) 第1个和第19个样本来自同一个人
- !ggplot(z, aes(type,TTC32,fill=type)) +
- geom_boxplot() +
- geom_point(size=2, alpha=0.5) +
- geom_line(aes(group=pairinfo), colour="black", linetype="11") +
- xlab("") +
- ylab(paste("Expression of ","TTC32"))+
- theme_classic()+
- theme(legend.position = "none")
- library(ggsignif)
- compaired <- list(c("control", "treatment"))
-
- library(ggsci)
- ggplot(z,aes(type,TTC32,fill=type)) +
- geom_violin()+
- geom_signif(comparisons =compaired ,step_increase = 0.1,map_signif_level = c("***"=0.001, "**"=0.01, "*"=0.05),test =t.test,size=2,textsize = 6)+
- geom_jitter(shape=16, position=position_jitter(0.2))
- #筛选差异基因(基础表达量高、变化明显、p值显著)
- select.FPKM <- data$AveExpr>5 #筛选表达量>5的基因
- select.log2FC <- abs(data$logFC)>1
- select.Pvalue <- data$P.Value<0.05
- select.vec <- select.FPKM & select.log2FC & select.Pvalue
- table(select.vec)
-
- degs.list <- as.character(rownames(data))[select.vec] #筛选出的基因名
-
- ##BP、CC、MF改ont值即可
- library(org.Hs.eg.db)
- library(clusterProfiler)
- erich.go.BP = enrichGO(gene =degs.list,
- OrgDb = org.Hs.eg.db,
- keyType = "SYMBOL",
- ont = "BP",
- pvalueCutoff = 0.05,
- qvalueCutoff = 0.05)
- dotplot(erich.go.BP,showCategory = 8)#展示8个通路#横坐标为GeneRatio,代表差异基因占通路的百分比
- #或barplot(erich.go.BP,showCategory = 8)
-
- erich.go.BP=erich.go.BP@result#通路信息
- write.table(erich.go.BP,"erich.go.BP.deg.con.hf.txt",sep = "\t",col.names = NA)
-
详细版
- setwd("")
-
- #install.packages("colorspace")
- #install.packages("stringi")
- #install.packages("ggplot2")
- #install.packages("circlize")
- #install.packages("RColorBrewer")
-
- #if (!requireNamespace("BiocManager", quietly = TRUE))
- # install.packages("BiocManager")
- #BiocManager::install("org.Hs.eg.db")
- #BiocManager::install("DOSE")
- #BiocManager::install("clusterProfiler")
- #BiocManager::install("enrichplot")
- #BiocManager::install("ComplexHeatmap")
-
-
-
- library(clusterProfiler)
- library(org.Hs.eg.db)
- library(enrichplot)
- library(ggplot2)
- library(circlize)
- library(RColorBrewer)
- library(dplyr)
- library(ComplexHeatmap)
-
- pvalueFilter=0.05 #p值过滤条件
- adjPvalFilter=0.05 #矫正后的p值过滤条件
-
- #定义颜色
- colorSel="p.adjust"
- if(adjPvalFilter>0.05){
- colorSel="pvalue"
- }
-
- rt=read.table("interGenes.Type.txt", header=T, sep="\t", check.names=F) #??ȡ?????ļ?
-
- #提取差异基因的名称,并将其转换为entrezID
- genes=unique(as.vector(rt[,1]))
- entrezIDs=mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
- entrezIDs=as.character(entrezIDs)
- rt=cbind(rt, entrezIDs)
- rt=rt[rt[,"entrezIDs"]!="NA",] #去除entrezID为NA的基因
- gene=rt$entrezID
- #gene=gsub("c\\(\"(\\d+)\".*", "\\1", gene)
- geneFC=rt$logFC
- names(geneFC)=gene
-
- #GO富集分析
- kk=enrichGO(gene=gene, OrgDb=org.Hs.eg.db, pvalueCutoff=1, qvalueCutoff=1, ont="all", readable=T)#ont="all"表示同时进行BP,CC,MF
- GO=as.data.frame(kk)
- GO=GO[(GO$pvalue<pvalueFilter & GO$p.adjust<adjPvalFilter),]
- write.table(GO, file="GO.txt", sep="\t", quote=F, row.names = F)
-
- #柱状图
- pdf(file="barplot.pdf", width=8.5, height=7)
- bar=barplot(kk, drop=TRUE, showCategory=10, label_format=130, split="ONTOLOGY", color=colorSel) + facet_grid(ONTOLOGY~., scale='free')
- print(bar)
- dev.off()
-
- #气泡图
- pdf(file="bubble.pdf", width=8.5, height=7)
- bub=dotplot(kk, showCategory=10, orderBy="GeneRatio", label_format=130, split="ONTOLOGY", color=colorSel) + facet_grid(ONTOLOGY~., scale='free')
- print(bub)
- dev.off()
-
- #基因和功能的关系图
- pdf(file="cnetplot.pdf", width=8, height=5.25)
- cnet=cnetplot(kk, circular=TRUE, foldChange=geneFC, showCategory=5, colorEdge=TRUE)
- print(cnet)
- dev.off()
-
-
- ###########GO圈图#############################
- ontology.col=c("#00AFBB", "#E7B800", "#90EE90")
- data=GO[order(GO$pvalue),]
- datasig=data[data$pvalue<0.05,,drop=F]
- BP = datasig[datasig$ONTOLOGY=="BP",,drop=F]
- CC = datasig[datasig$ONTOLOGY=="CC",,drop=F]
- MF = datasig[datasig$ONTOLOGY=="MF",,drop=F]
- BP = head(BP,6)
- CC = head(CC,6)
- MF = head(MF,6)
- data = rbind(BP,CC,MF)
- main.col = ontology.col[as.numeric(as.factor(data$ONTOLOGY))]
-
- #整理圈图数据
- BgGene = as.numeric(sapply(strsplit(data$BgRatio,"/"),'[',1))
- Gene = as.numeric(sapply(strsplit(data$GeneRatio,'/'),'[',1))
- ratio = Gene/BgGene
- logpvalue = -log(data$pvalue,10)
- logpvalue.col = brewer.pal(n = 8, name = "Reds")
- f = colorRamp2(breaks = c(0,2,4,6,8,10,15,20), colors = logpvalue.col)
- BgGene.col = f(logpvalue)
- df = data.frame(GO=data$ID,start=1,end=max(BgGene))
- rownames(df) = df$GO
- bed2 = data.frame(GO=data$ID,start=1,end=BgGene,BgGene=BgGene,BgGene.col=BgGene.col)
- bed3 = data.frame(GO=data$ID,start=1,end=Gene,BgGene=Gene)
- bed4 = data.frame(GO=data$ID,start=1,end=max(BgGene),ratio=ratio,col=main.col)
- bed4$ratio = bed4$ratio/max(bed4$ratio)*9.5
-
- #绘制圈图主体部分
- pdf("GO.circlize.pdf",width=10,height=10)
- par(omi=c(0.1,0.1,0.1,1.5))
- circos.par(track.margin=c(0.01,0.01))
- circos.genomicInitialize(df,plotType="none")
- circos.trackPlotRegion(ylim = c(0, 1), panel.fun = function(x, y) {
- sector.index = get.cell.meta.data("sector.index")
- xlim = get.cell.meta.data("xlim")
- ylim = get.cell.meta.data("ylim")
- circos.text(mean(xlim), mean(ylim), sector.index, cex = 0.8, facing = "bending.inside", niceFacing = TRUE)
- }, track.height = 0.08, bg.border = NA,bg.col = main.col)
-
- for(si in get.all.sector.index()) {
- circos.axis(h = "top", labels.cex = 0.6, sector.index = si,track.index = 1,
- major.at=seq(0,max(BgGene),by=100),labels.facing = "clockwise")
- }
- f = colorRamp2(breaks = c(-1, 0, 1), colors = c("green", "black", "red"))
- circos.genomicTrack(bed2, ylim = c(0, 1),track.height = 0.1,bg.border="white",
- panel.fun = function(region, value, ...) {
- i = getI(...)
- circos.genomicRect(region, value, ytop = 0, ybottom = 1, col = value[,2],
- border = NA, ...)
- circos.genomicText(region, value, y = 0.4, labels = value[,1], adj=0,cex=0.8,...)
- })
- circos.genomicTrack(bed3, ylim = c(0, 1),track.height = 0.1,bg.border="white",
- panel.fun = function(region, value, ...) {
- i = getI(...)
- circos.genomicRect(region, value, ytop = 0, ybottom = 1, col = '#BA55D3',
- border = NA, ...)
- circos.genomicText(region, value, y = 0.4, labels = value[,1], cex=0.9,adj=0,...)
- })
- circos.genomicTrack(bed4, ylim = c(0, 10),track.height = 0.35,bg.border="white",bg.col="grey90",
- panel.fun = function(region, value, ...) {
- cell.xlim = get.cell.meta.data("cell.xlim")
- cell.ylim = get.cell.meta.data("cell.ylim")
- for(j in 1:9) {
- y = cell.ylim[1] + (cell.ylim[2]-cell.ylim[1])/10*j
- circos.lines(cell.xlim, c(y, y), col = "#FFFFFF", lwd = 0.3)
- }
- circos.genomicRect(region, value, ytop = 0, ybottom = value[,1], col = value[,2],
- border = NA, ...)
- #circos.genomicText(region, value, y = 0.3, labels = value[,1], ...)
- })
- circos.clear()
- #绘制圈图中间的图例
- middle.legend = Legend(
- labels = c('Number of Genes','Number of Select','Rich Factor(0-1)'),
- type="points",pch=c(15,15,17),legend_gp = gpar(col=c('pink','#BA55D3',ontology.col[1])),
- title="",nrow=3,size= unit(3, "mm")
- )
- circle_size = unit(1, "snpc")
- draw(middle.legend,x=circle_size*0.42)
- #绘制GO分类的图例
- main.legend = Legend(
- labels = c("Biological Process", "Cellular Component", "Molecular Function"), type="points",pch=15,
- legend_gp = gpar(col=ontology.col), title_position = "topcenter",
- title = "ONTOLOGY", nrow = 3,size = unit(3, "mm"),grid_height = unit(5, "mm"),
- grid_width = unit(5, "mm")
- )
- #绘制富集显著性pvalue的图例
- logp.legend = Legend(
- labels=c('(0,2]','(2,4]','(4,6]','(6,8]','(8,10]','(10,15]','(15,20]','>=20'),
- type="points",pch=16,legend_gp=gpar(col=logpvalue.col),title="-log10(Pvalue)",
- title_position = "topcenter",grid_height = unit(5, "mm"),grid_width = unit(5, "mm"),
- size = unit(3, "mm")
- )
- lgd = packLegend(main.legend,logp.legend)
- circle_size = unit(1, "snpc")
- print(circle_size)
- draw(lgd, x = circle_size*0.85, y=circle_size*0.55,just = "left")
- dev.off()

- DEG.entrez_id = mapIds(x = org.Hs.eg.db,
- keys = degs.list,
- keytype = "SYMBOL",
- column = "ENTREZID")
-
- erich.kegg.res <- enrichKEGG(gene = DEG.entrez_id,
- organism = "hsa",#组织名称,hsa表示人
- keyType = "kegg")
-
- dotplot(erich.kegg.res)
-
- zzh=erich.kegg.res@result
- write.table(zzh,"erich.kegg.res.6.19.txt",sep = "\t",col.names = NA)
详细版
- setwd("")
- #install.packages("colorspace")
- #install.packages("stringi")
- #install.packages("ggplot2")
-
- #if (!requireNamespace("BiocManager", quietly = TRUE))
- # install.packages("BiocManager")
- #BiocManager::install("org.Hs.eg.db")
- #BiocManager::install("DOSE")
- #BiocManager::install("clusterProfiler")
- #BiocManager::install("enrichplot")
-
-
-
- library("clusterProfiler")
- library("org.Hs.eg.db")
- library("enrichplot")
- library("ggplot2")
-
- pvalueFilter=0.05
- adjPvalFilter=0.05
-
- #定义图形的颜色
- colorSel="p.adjust"
- if(adjPvalFilter>0.05){
- colorSel="pvalue"
- }
-
-
- rt=read.table("interGenes.Type.txt", header=T, sep="\t", check.names=F) #??ȡ?????ļ?
-
- #提取差异基因名称,将基因名转换为entrezID
- genes=unique(as.vector(rt[,1]))
- entrezIDs=mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
- entrezIDs=as.character(entrezIDs)
- rt=cbind(rt, entrezIDs)
- rt=rt[rt[,"entrezIDs"]!="NA",]
- gene=rt$entrezID
- #gene=gsub("c\\(\"(\\d+)\".*", "\\1", gene)
- geneFC=rt$logFC
- names(geneFC)=gene
-
- #kegg富集分析
- kk <- enrichKEGG(gene=gene, organism="hsa", pvalueCutoff=1, qvalueCutoff=1)
- KEGG=as.data.frame(kk)
- KEGG$geneID=as.character(sapply(KEGG$geneID,function(x)paste(rt$genes[match(strsplit(x,"/")[[1]],as.character(rt$entrezID))],collapse="/")))
- KEGG=KEGG[(KEGG$pvalue<pvalueFilter & KEGG$p.adjust<adjPvalFilter),]
- #输出显著富集的结果
- write.table(KEGG, file="KEGG.txt", sep="\t", quote=F, row.names = F)
-
- #定义显示通路的数目
- showNum=30
- if(nrow(KEGG)<showNum){
- showNum=nrow(KEGG)
- }
-
- #柱状图
- pdf(file="barplot.pdf", width=8, height=7)
- barplot(kk, drop=TRUE, showCategory=showNum, label_format=100, color=colorSel)
- dev.off()
-
- #气泡图
- pdf(file="bubble.pdf", width=8, height=7)
- dotplot(kk, showCategory=showNum, orderBy="GeneRatio", label_format=100, color=colorSel)
- dev.off()
-
- #基因和通路的关系图
- pdf(file="cnetplot.pdf", width=9, height=5.25)
- kkx=setReadable(kk, 'org.Hs.eg.db', 'ENTREZID')
- cnet=cnetplot(kkx, circular=TRUE, foldChange=geneFC, showCategory=5, colorEdge=TRUE)
- print(cnet)
- dev.off()

- options(stringsAsFactors = FALSE)
- rm(list=ls())
- setwd("D:\\immune cell infiltration\\GEO")
- kegg=read.table("1.27.txt",header = T,sep = "\t")
- k = data.frame(kegg)
- library(ggplot2)
- library(dplyr)
-
- #将GeneRatio转换为小数点样式
- x=k$GeneRatio
- x1=as.character(x)
- a=strsplit(x1,split="/",fixed=T)
- be=sapply(a,function(x){x[1]})
- be1=as.numeric(be)
- after=sapply(a,function(x){x[2]})
- after=as.numeric(after)
- ?strsplit
- k$GeneRatio = be1/after
- rm(x,x1,a,be,be1,after)
- #或者
- before <- as.numeric(sub("/\\d+$",replacement = "", k$GeneRatio))
- after <- as.numeric(sub("^\\d+/", "", k$GeneRatio))
-
- font.size =10
-
- k %>%
- ## 对进行p值排序
- arrange(p.adjust) %>%
- ##指定富集的通路数目(10个)
- slice(1:10) %>%
- ## 开始ggplot2 作图,其中fct_reorder调整因子level的顺序
- ggplot(aes(GeneRatio,forcats::fct_reorder(Description,Count)))+
- ## 画出点图,size=Count代表气泡越大,count数越高
- geom_point(aes(color=p.adjust, size = Count)) +
- ## 调整颜色,guide_colorbar调整色图的方向
- scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE))+
- ## 调整泡泡的大小
- scale_size_continuous(range=c(3, 8))+
- ## 如果用ylab("")或出现左侧空白
- labs(y=NULL) +
- ## 如果没有这一句,上方会到顶
- ggtitle("")+
- ## 设定主题
- theme_bw() +
- theme(axis.text.x = element_text(colour = "black",
- size = font.size, vjust =1 ),
- axis.text.y = element_text(colour = "black",
- size = font.size, hjust =1 ),
- axis.title = element_text(margin=margin(10, 5, 0, 0),
- color = "black",size = font.size),
- axis.title.y = element_text(angle=90))

- select.FPKM <- data$AveExpr>5 #筛选表达量>5的基因
- table(select.FPKM)
- gs.list <- as.character(rownames(data))[select.FPKM] #筛选出的基因名
- f1=data[gs.list,]
- f1$symbol <- rownames(f1)
- exp.fc <- f1[,c(8,1)]#只含有基因名和logFC值
- rm(f1)
- ###id转化
- expm.id <- bitr(exp.fc$symbol, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
- exp.fc.id <- merge(exp.fc, expm.id,by.x="symbol", by.y="SYMBOL", all=F)
- exp.fc.id=na.omit(exp.fc.id)
- exp.fc.id.sorted <- exp.fc.id[order(exp.fc.id$logFC, decreasing = T),]
-
- id.fc <- exp.fc.id.sorted$logFC
- names(id.fc) <- exp.fc.id.sorted$ENTREZID
- head(id.fc)
- rm(expm.id,exp.fc.id,exp.fc.id.sorted)
- gsea <- gseKEGG(id.fc, organism = "hsa",pvalueCutoff = 0.5)
- go_result <- gseGO(geneList = id.fc,
- ont = "BP",
- OrgDb = org.Hs.eg.db,
- pvalueCutoff = 0.5)
-
-
- ####查看一下富集结果(NES绝对值大于1.5、p.adjust<0.05富集效果较好)
- View(go_result@result)
-
- gseaplot2(gsea, geneSetID = 1,title = "abc")#绘制gsea中的第一条通路
- gseaplot2(gsea, geneSetID = c(2,3))#第二和第三条通路

- select.FPKM <- data$AveExpr>5 #筛选表达量>5的基因
- table(select.FPKM)
- gs.list <- as.character(rownames(data))[select.FPKM] #筛选出的基因名
- f1=data[gs.list,]
- f1$symbol <- rownames(f1)
- f1 <- distinct(f1,symbol,.keep_all = T)
- exp.fc <- f1[,c(8,1)]#只含有基因名和logFC值
- rm(f1)
-
- deg = exp.fc$logFC
- names(deg) = exp.fc$symbol
- deg= sort(deg,decreasing = T)
- head(deg)
- install.packages("msigdbr")
- library(msigdbr)
- h.kegg= msigdbr(species ="Homo sapiens",category = "C2",subcategory = "KEGG")
- length(unique(h.kegg$gs_exact_source))
- h.kegg= h.kegg[,c(3,4)]
- gsea <- GSEA(deg,pvalueCutoff = 0.5, TERM2GENE =h.kegg)
- gseaplot2(gsea, geneSetID = 6, title = gsea$Description[6])
- gsea1=gsea@result
-
- write.table(gsea1,"gsea.res.txt",sep = "\t",col.names = NA)

- install.packages("UpSetR")
- library(UpSetR)
- setwd("D:\\immune cell infiltration\\multiarray")
- rt=read.csv("score.csv", header=T, sep=",", check.names=F, row.names=1)
- colnames(rt)=c(colnames(rt)[2:ncol(rt)], "N")
- scoreType=c("MCC", "DMNC", "MNC", "Degree", "EPC","BottleNeck", "EcCentricity","Closeness", "Radiality", "Betweenness", "Stress","ClusteringCoefficient")
-
- #对打分进行排序
- upsetList=c()
- for(i in scoreType){
- data=rt[order(rt[,i],decreasing=T),]
- hubGene=row.names(data)[1:50] #筛选打分最高的前50个基因
- upsetList[[i]]=hubGene
- }
- upsetData=fromList(upsetList)
-
-
- #输出图形
- pdf(file="upset.pdf", width=9, height=6, onefile=FALSE)
- upset(upsetData,
- nsets = length(scoreType), #展示打分类型个数
- nintersects = 50, #展示基因集数目
- order.by = "freq", #排序的方式(按照基因数目排序)
- show.numbers = "yes", #ֵ柱状图上方是否显示数值
- number.angles = 20, #字体角度
- point.size = 1.5, #点的大小
- matrix.color="red", #点的颜色
- line.size = 0.8, #线条粗细
- mainbar.y.label = "Gene Intersections",
- sets.x.label = "Set Size")
- dev.off()
-
- #获取交集基因,作为网络的核心基因
- intersectGenes=Reduce(intersect, upsetList)
- intersectGenes=as.data.frame(intersectGenes)
- colnames(intersectGenes)="Gene"
- write.csv(file="hubGenes.csv", intersectGenes, quote=F, row.names=F)

- #if (!requireNamespace("BiocManager", quietly = TRUE))
- # install.packages("BiocManager")
- #BiocManager::install("limma")
-
- #install.packages("pheatmap")
- library(limma)
- library(pheatmap)
- setwd("")
-
- exp=read.table("exp6.csv", header=T, sep=",", check.names=F,row.names = 1)
- dimnames=list(rownames(exp),colnames(exp))
- data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
- data=avereps(data)
-
- geneRT=read.csv("hubGenes.csv", header=T, sep=",", check.names=F)
- data=data[as.vector(geneRT[,1]),]#核心基因的表达矩阵
- annotation_col1 <- data.frame(
- database <- c(rep("GEO124226",8)),
- Type <- c(rep("control",4),rep("treatment",4))
- )
- rownames(annotation_col1) <- colnames(data)
- colnames(annotation_col1) <- c("database","Type")
- #绘制核心基因热图
- pdf(file="hubGene.heatmap.pdf", width=7.5, height=3.5)
- pheatmap(data,
- annotation_col =annotation_col1,
- color = colorRampPalette(c(rep("blue",2), "white", rep("red",2)))(50),
- cluster_cols =F,
- show_colnames = F,
- scale="row",
- fontsize = 8,
- fontsize_row=7,
- fontsize_col=8)
- dev.off()

- #install.packages("glmnet")
- #install.packages("pROC")
-
- library(glmnet)
- library(pROC)
-
- expFile="limma_expr_control+treat.csv"
- geneFile="cytohubba_0.9_10.csv" #核心基因
- setwd("D:\\limma_expr\\WGCNA1")
-
- rt=read.table(expFile, header=T, sep=",", check.names=F, row.names=1)
- geneRT=read.table(geneFile, header=T, sep=",", check.names=F)
- geneRT <- data.frame(geneRT$Name)
- #获取表达矩阵样品信息
- y <- c(rep(0,15),rep(1,23))
-
-
- #对核心基因进行循环,绘制ROC曲线
- bioCol=rainbow(nrow(geneRT), s=0.9, v=0.9) #定义图形颜色
- aucText=c()
- k=0
- for(x in as.vector(geneRT[,1])){
- k=k+1
- 绘制ROC曲线
- roc1=roc(y, as.numeric(rt[x,])) #得到ROC曲线的参数
- if(k==1){
- pdf(file="ROC.genes.pdf", width=5, height=4.75)
- plot(roc1, print.auc=F, col=bioCol[k], legacy.axes=T, main="")
- aucText=c(aucText, paste0(x,", AUC=",sprintf("%.3f",roc1$auc[1])))
- }else{
- plot(roc1, print.auc=F, col=bioCol[k], legacy.axes=T, main="", add=TRUE)
- aucText=c(aucText, paste0(x,", AUC=",sprintf("%.3f",roc1$auc[1])))
- }
- }
- #绘制图例,得到ROC曲线下面积
- legend("bottomright", aucText, lwd=2, bty="n", col=bioCol[1:(ncol(rt)-1)])
- dev.off()
-
- #构建逻辑模型
- rt=rt[as.vector(geneRT[,1]),] #提取核心基因表达量
- rt=as.data.frame(t(rt))
- logit=glm(y ~ ., family=binomial(link='logit'), data=rt)
- pred=predict(logit, newx=rt) #得到模型的打分
-
- #绘制模型的ROC曲线
- roc1=roc(y, as.numeric(pred)) #得到模型ROC曲线的参数
- ci1=ci.auc(roc1, method="bootstrap") #ROC曲线下面积的波动范围
- ciVec=as.numeric(ci1)
- pdf(file="ROC.model.pdf", width=5, height=4.75)
- plot(roc1, print.auc=TRUE, col="red", legacy.axes=T, main="Model")
- text(0.39, 0.43, paste0("95% CI: ",sprintf("%.03f",ciVec[1]),"-",sprintf("%.03f",ciVec[3])), col="red")
- dev.off()

准备矫正后的合并文件combat_expr.csv或limma_expr.csv
cytoscape评分结果文件hubGenes.csv,免疫细胞结果文件cibersort_result.txt和Diff.cell.txt
- #if (!requireNamespace("BiocManager", quietly = TRUE))
- # install.packages("BiocManager")
- #BiocManager::install("limma")
-
- #install.packages("tidyverse")
- #install.packages("ggplot2")
- #install.packages("reshape2")
- #install.packages("ggpubr")
- install.packages("ggExtra")
- #install.packages("corrplot")
-
- library(limma)
- library(reshape2)
- library(tidyverse)
- library(ggplot2)
- library(ggpubr)
- library(ggExtra)
- library(corrplot)
-
- corFilter=0.4 #相关系数过滤文件
- pvalueFilter=0.001 #相关性检验pvalue过滤文件
-
-
- exp=read.table("limma_expr.csv", header=T, sep=",", check.names=F,row.names = 1)
- exp=as.matrix(exp)
- dimnames=list(rownames(exp),colnames(exp))
- data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
- data=avereps(data)
-
- #提取核心基因的表达量
- geneRT=read.csv("hubGenes.csv", header=T, sep=",", check.names=F)
- data=data[as.vector(geneRT[,1]),]#行名基因名,列名样本名
-
- #去除对照组样品
- group <- data.frame(sample=character(38),group=character(38))
- group$sample <- colnames(exp)
- tissue <- c(rep(c("control","treat","control","treat","control","treat"),c(3,7,4,8,8,8)))
- group$group <- tissue
- data <- cbind(t(data),group)
- data=data[data$group=="treat",]#行名为样本名,列名为基因名
- data <- data[,-c(13,14)]
- data <- as.matrix(data)
-
-
- #读取核心免疫细胞的含量
- immune=read.table("cibersort_result.txt", header=T, sep="\t", check.names=F, row.names=1)
- cellRT=read.csv("Diff.cell.txt", header=F, sep="\t", check.names=F)
- immune=immune[,as.vector(cellRT[,1])]#行名样本名,列名为免疫细胞名
- #提取基因表达矩阵和免疫浸润结果文件中都包含的样本信息
- sameSample=intersect(row.names(data), row.names(immune))
- data=data[sameSample,,drop=F]
- immune=immune[sameSample,,drop=F]#都只含实验组样品
-
- #相关性分析
- outTab=data.frame()
- #对免疫细胞进行循环
- for(cell in colnames(immune)){
- if(sd(immune[,cell])==0){next}
- #对核心基因进行循环
- for(gene in colnames(data)){
- x=as.numeric(data[,gene])
- y=as.numeric(immune[,cell])
- corT=cor.test(x,y,method="spearman")
- cor=corT$estimate
- pvalue=corT$p.value
- #对满足条件的免疫细胞进行可视化,绘制相关性图形
- if((abs(cor)>corFilter) & (pvalue<pvalueFilter)){
- df1=as.data.frame(cbind(x,y))
- p1=ggplot(df1, aes(x, y)) +
- xlab(paste0(gene, " expression")) + ylab(cell)+
- geom_point() + geom_smooth(method="lm",formula = y ~ x) + theme_bw()+
- stat_cor(method = 'spearman', aes(x =x, y =y))
- #相关性图形
- pdf(file=paste0("cor.", gene, "_", cell, ".pdf"), width=5, height=4.6)
- print(p1)
- dev.off()
- }
- }
- }
-
- #相关性热图ͼ
- data=cbind(data, immune)#行名为实验组样品名,列名为基因名和免疫细胞名
- write.csv(data,"data")
- a <- read.csv("data",header=T,row.names = 1)
- pdf("corHeatmap.pdf", width=9, height=9)
- corrplot(corr=cor(a),
- method = "color", #图形的展示形式
- order = "original", #免疫细胞的排序方式
- tl.col="black", #字体颜色
- addCoef.col = "black", #相关系数字体颜色
- number.cex = 0.8, #相关系数字体大小
- col=colorRampPalette(c("blue", "white", "red"))(50), #ͼ????ɫ??????
- )
- dev.off()

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。