赞
踩
本文介绍了CIBERSORT两种使用方法,大家可以自行选择,方法二简单些,方法一原始些
本文顺便倡议大家使用Rproject来管理代码,感谢生信技能树jimmy老师让我知道了这么方便的玩意,再也不用拼命setwd()和getwd()了,不想看这部分可以直接下滑。
CIBERSORTx是原版网站,建议大家去学习,并且学习他们发的经典文章
鸣谢:生信技能树jimmy老师和 Biomamba 生信基地 BIOMAMBA老师
# Install the packages the CIBERSORT script depends on.
# If these fail to install, installing bseqsc (further down) is an alternative:
# it brings in most of the CIBERSORT dependencies.
install.packages("e1071")
# 'parallel' ships with base R, so there is nothing to install for it.
# preprocessCore is distributed through Bioconductor rather than CRAN.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("preprocessCore")
library(e1071)
library(preprocessCore)
library(parallel)

install.packages("devtools")
library(devtools)
devtools::install_github("shenorrlab/bseqsc")
# bseqsc carries many of the CIBERSORT dependencies; use it as a fallback when
# the packages above cannot be installed.
library(bseqsc)
此法使用Cibersort工具需要三个文件:
1、sourcecibersort.R
2、LM22.txt
3、genes_exp.txt
直接把下列代码新建一个script,然后保存,保存名字为sourcecibersort.R
- #' CIBERSORT R script v1.03 (last updated 07-10-2015)
- #' Note: Signature matrix construction is not currently available; use java version for full functionality.
- #' Author: Aaron M. Newman, Stanford University (amnewman@stanford.edu)
- #' Requirements:
- #' R v3.0 or later. (dependencies below might not work properly with earlier versions)
- #' install.packages('e1071')
- #' install.packages('parallel')
- #' install.packages('preprocessCore')
- #' if preprocessCore is not available in the repositories you have selected, run the following:
- #' source("http://bioconductor.org/biocLite.R")
- #' biocLite("preprocessCore")
- #' Windows users using the R GUI may need to Run as Administrator to install or update packages.
- #' This script uses 3 parallel processes. Since Windows does not support forking, this script will run
- #' single-threaded in Windows.
- #'
- #' Usage:
- #' Navigate to directory containing R script
- #'
- #' In R:
- #' source('CIBERSORT.R')
- #' results <- CIBERSORT('sig_matrix_file.txt','mixture_file.txt', perm, QN)
- #'
- #' Options:
- #' i) perm = No. permutations; set to >=100 to calculate p-values (default = 0)
- #' ii) QN = Quantile normalization of input mixture (default = TRUE)
- #'
- #' Input: signature matrix and mixture file, formatted as specified at http://cibersort.stanford.edu/tutorial.php
- #' Output: matrix object containing all results and tabular data written to disk 'CIBERSORT-Results.txt'
- #' License: http://cibersort.stanford.edu/CIBERSORT_License.txt
#' Core algorithm
#'
#' Fits a linear nu-support-vector regression of `y` on `X` for three values
#' of nu (0.25, 0.5, 0.75). Each fit is converted into non-negative weights
#' that sum to one, and the fit whose reconstructed mixture has the lowest
#' RMSE against `y` is kept.
#'
#' @param X cell-specific gene expression
#' @param y mixed expression per sample
#' @return A list with `w` (the normalized weights of the selected fit),
#'   `mix_rmse` (RMSE between the reconstructed mixture and `y`) and `mix_r`
#'   (their correlation).
#' @export
CoreAlg <- function(X, y) {
  # candidate values of nu to try
  nu_grid <- c(0.25, 0.5, 0.75)

  fit_svr <- function(i) {
    e1071::svm(X, y, type = "nu-regression", kernel = "linear",
               nu = nu_grid[i], scale = FALSE)
  }

  # Windows does not support forking, so run single-core there
  n_cores <- if (Sys.info()[["sysname"]] == "Windows") 1L else length(nu_grid)
  fits <- parallel::mclapply(seq_along(nu_grid), fit_svr, mc.cores = n_cores)

  # score every fit: weights -> reconstructed mixture -> RMSE / correlation
  weights <- vector("list", length(fits))
  rmses <- numeric(length(fits))
  corrs <- numeric(length(fits))
  for (i in seq_along(fits)) {
    w <- t(fits[[i]]$coefs) %*% fits[[i]]$SV
    w[w < 0] <- 0 # negative coefficients are clamped to zero
    w <- w / sum(w)
    weights[[i]] <- w

    # reconstructed mixture: weighted sum of the signature columns
    k <- rowSums(sweep(X, MARGIN = 2, w, "*"))
    rmses[i] <- sqrt(mean((k - y)^2))
    corrs[i] <- cor(k, y)
  }

  # pick the model with the smallest RMSE
  best <- which.min(rmses)

  list(w = weights[[best]], mix_rmse = rmses[best], mix_r = corrs[best])
}
#' do permutations
#'
#' Builds an empirical null distribution of correlation coefficients: for
#' each permutation, `nrow(X)` values are drawn at random from all entries of
#' `Y`, standardized, and passed through `CoreAlg()`; the resulting `mix_r`
#' is recorded.
#'
#' @param perm Number of permutations
#' @param X cell-specific gene expression
#' @param Y mixed expression (all samples); its entries form the sampling pool
#' @return A list with one element, `dist`: a one-column matrix holding the
#'   correlation obtained in each permutation (one row per permutation).
#' @export
doPerm <- function(perm, X, Y) {
  pool <- as.numeric(data.matrix(Y))
  n_genes <- nrow(X)

  null_r <- vapply(seq_len(perm), function(i) {
    # random mixture
    yr <- pool[sample(length(pool), n_genes)]

    # standardize mixture
    yr <- (yr - mean(yr)) / sd(yr)

    # run CIBERSORT core algorithm and keep its correlation
    CoreAlg(X, yr)$mix_r
  }, numeric(1))

  list(dist = matrix(null_r, ncol = 1))
}
-
#' Main functions
#'
#' Reads a signature matrix and a mixture file (tab-separated, genes in the
#' first column), restricts both to their shared genes, and runs `CoreAlg()`
#' on every mixture column. Results are written to 'CIBERSORT-Results.txt' in
#' the working directory and also returned.
#'
#' @param sig_matrix file path to gene expression from isolated cells
#' @param mixture_file heterogenous mixed expression
#' @param perm Number of permutations
#' @param QN Perform quantile normalization or not (TRUE/FALSE)
#' @return Numeric matrix with one row per mixture and one column per column
#'   of the signature matrix, followed by "P-value", "Correlation" and "RMSE".
#'   When `perm` is 0 no p-value is computed and the column holds 9999.
#' @export
CIBERSORT <- function(sig_matrix, mixture_file, perm = 0, QN = TRUE) {
  # read in data
  X <- read.table(sig_matrix, header = TRUE, sep = "\t", row.names = 1,
                  check.names = FALSE)
  Y <- read.table(mixture_file, header = TRUE, sep = "\t", row.names = 1,
                  check.names = FALSE)

  X <- data.matrix(X)
  Y <- data.matrix(Y)

  if (anyNA(Y)) {
    stop("mixture file contains missing (NA) values; remove them first",
         call. = FALSE)
  }

  # order by gene name; drop = FALSE keeps a single-sample mixture a matrix
  X <- X[order(rownames(X)), , drop = FALSE]
  Y <- Y[order(rownames(Y)), , drop = FALSE]

  P <- perm # number of permutations

  # anti-log if max < 50 in mixture file
  if (max(Y) < 50) {
    Y <- 2^Y
  }

  # quantile normalization of mixture file
  if (QN == TRUE) {
    saved_names <- dimnames(Y)
    Y <- preprocessCore::normalize.quantiles(Y)
    dimnames(Y) <- saved_names
  }

  # intersect genes
  Y <- Y[rownames(Y) %in% rownames(X), , drop = FALSE]
  X <- X[rownames(X) %in% rownames(Y), , drop = FALSE]
  if (nrow(X) == 0L) {
    stop("signature matrix and mixture file share no gene names",
         call. = FALSE)
  }

  # standardize sig matrix
  X <- (X - mean(X)) / sd(as.vector(X))

  # empirical null distribution of correlation coefficients
  if (P > 0) {
    nulldist <- sort(doPerm(P, X, Y)$dist)
  }

  n_mix <- ncol(Y)
  res <- matrix(
    NA_real_,
    nrow = n_mix,
    ncol = ncol(X) + 3L,
    dimnames = list(colnames(Y),
                    c(colnames(X), "P-value", "Correlation", "RMSE"))
  )
  pval <- 9999 # placeholder reported when no permutations are requested

  # iterate through mixtures
  for (i in seq_len(n_mix)) {
    y <- Y[, i]

    # standardize mixture
    y <- (y - mean(y)) / sd(y)

    # run SVR core algorithm
    fit <- CoreAlg(X, y)

    # calculate p-value from the position of mix_r in the sorted null
    if (P > 0) {
      pval <- 1 - (which.min(abs(nulldist - fit$mix_r)) / length(nulldist))
    }

    res[i, ] <- c(fit$w, pval, fit$mix_r, fit$mix_rmse)
  }

  # save results: header row, then one row per mixture led by its name
  header <- c("Mixture", colnames(res))
  write.table(rbind(header, cbind(colnames(Y), res)),
              file = "CIBERSORT-Results.txt", sep = "\t",
              row.names = FALSE, col.names = FALSE, quote = FALSE)

  # return matrix object containing all results
  res
}
# Load the CIBERSORT() function. sourcecibersort.R must be in the same folder
# (the working directory) for this to work.
source("sourcecibersort.R")

# perm = 1000 permutations; QN = TRUE switches on quantile normalization.
# The file names can be changed to match your own files.
results <- CIBERSORT(
  sig_matrix = "LM22.txt",
  mixture_file = "genes_exp.txt",
  perm = 1000,
  QN = TRUE
)
# The result can then be used to draw heatmaps and other plots.
# Install the packages CIBERSORT depends on.
# If these fail to install, installing bseqsc (below) is an alternative.
install.packages("e1071")
# 'parallel' ships with base R, so there is nothing to install for it.
# preprocessCore and ComplexHeatmap are distributed through Bioconductor.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("preprocessCore")
library(e1071)
library(preprocessCore)
library(parallel)

install.packages("devtools")
library(devtools)
devtools::install_github("shenorrlab/bseqsc")
# bseqsc carries many of the CIBERSORT dependencies; use it as a fallback when
# the packages above cannot be installed.
library(bseqsc)

# ---- Install the CIBERSORT package ----
if (!requireNamespace("CIBERSORT", quietly = TRUE)) {
  devtools::install_github("Moonerss/CIBERSORT")
}
library(CIBERSORT)
# All required packages are installed at this point.

# Packages used for the heatmaps and the stacked bar plot
install.packages(c("ggplot2", "pheatmap"))
BiocManager::install("ComplexHeatmap")
library(ggplot2)
library(pheatmap)
library(ComplexHeatmap)
安装好以后就可以使用cibersort函数了
# Keep the LM22 TXT file at hand too; your own files must follow the same
# layout as that TXT.
# Once the CIBERSORT package is loaded, its bundled LM22 dataset is available.
data(LM22)
data(mixed_expr) # TCGA demo data; use your own data in a real analysis

# Start exploring: look at the top-left 5 x 5 corner of each matrix
LM22[seq_len(5), seq_len(5)]
mixed_expr[seq_len(5), seq_len(5)]

# Signature matrix is LM22; the (demo) expression matrix is mixed_expr
results <- cibersort(sig_matrix = LM22, mixture_file = mixed_expr)

# How to read `results` (per the original tutorial; confirm against the
# package documentation):
#   rows are samples, columns are cell types, and the values are the estimated
#   infiltration level (fraction) of each cell type in each sample.
#   Three extra columns are appended:
#   P-value:     significance of the deconvolution result for the sample
#   Correlation: correlation between the input mixture and the mixture
#                reconstructed from the signature genes
#   RMSE:        root mean squared error between those two
-
# heatmap
# NOTE(review): ComplexHeatmap also exports a pheatmap(); whichever package was
# attached last supplies the function called here -- confirm that is intended.

# Scaling by row (within each sample) shows which cell type dominates inside
# each sample; the original tutorial reports M2 as the highest in the demo data.
rowscale <- results[, seq_len(ncol(LM22))] # cell-type columns only
# drop cell types whose column sums to zero
rowscale <- rowscale[, colSums(rowscale) > 0, drop = FALSE]
pheatmap(rowscale,
         scale = "row",        # scale by row; unscaled shows absolute values
         cluster_cols = TRUE,  # cluster columns; FALSE keeps original order
         cluster_rows = FALSE, # do not cluster rows
         angle_col = "315")    # tilt of the column labels

# Scaling by column highlights, for each cell type, the samples in which it
# is most abundant.
columnscale <- results[, seq_len(ncol(LM22))]
# drop cell types whose column sums to zero
columnscale <- columnscale[, colSums(columnscale) > 0, drop = FALSE]
pheatmap(columnscale,
         scale = "column",
         cluster_cols = FALSE,
         cluster_rows = TRUE,
         angle_col = "315")
-
# Stacked proportion bar plot
my36colors <- c(
  "#E5D2DD", "#53A85F", "#F1BB72", "#F3B1A0", "#D6E7A3", "#57C3F3",
  "#476D87", "#E95C59", "#E59CC4", "#AB3282", "#23452F", "#BD956A",
  "#8C549C", "#585658", "#9FA3A8", "#E0D4CA", "#5F3D69", "#C5DEBA",
  "#58A4C3", "#E4C755", "#F7F398", "#AA9A59", "#E63863", "#E39A35",
  "#C1E6F3", "#6778AE", "#91D0BE", "#B53E2B", "#712820", "#DCC1DD",
  "#CCE0F5", "#CCC9E6", "#625D9E", "#68A180", "#3A6963", "#968175"
)
cellnum <- results[, seq_len(ncol(LM22))] # cell-type columns only
# per-sample proportions; apply() over rows returns cell types x samples
cell.prop <- apply(cellnum, 1, function(x) {
  x / sum(x)
})

# long format, one row per (cell type, sample) pair; as.vector() walks the
# matrix column by column, i.e. sample by sample
data4plot <- data.frame(
  proportion = as.vector(cell.prop),
  celltype = rep(rownames(cell.prop), times = ncol(cell.prop)),
  sample = rep(colnames(cell.prop), each = nrow(cell.prop))
)

ggplot(data4plot, aes(sample, proportion, fill = celltype)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(values = my36colors) + # custom fill colours
  ggtitle("cell portation") +
  theme_bw() +
  theme(axis.ticks.length = unit(0.5, "cm"),
        axis.title.x = element_text(size = 1)) +
  # rotate the x-axis labels
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5)) +
  guides(fill = guide_legend(title = NULL))
LM22:
#########链接:https://pan.baidu.com/s/1eQSEekekozS5osgydwzk1w
#@####提取码:fk88
# Read the downloaded LM22 table; its first column is used as the row names.
LM22read <- read.csv("LM22.csv", header = TRUE)
gene <- LM22read[[1]]
row.names(LM22read) <- gene
LM22read <- LM22read[, -1]

# LM22 here is the dataset bundled with the CIBERSORT package (see above).
data(LM22)
# TRUE means the downloaded file and the bundled dataset hold the same values.
all(LM22 == LM22read)
鸣谢:生信技能树jimmy老师和 Biomamba 生信基地 BIOMAMBA老师
有疑问可以邮件联系我,会尽力帮忙:yunbk@mail2.sysu.edu.cn
2023.03.05更新
结合不少小伙伴的私信和邮件更新几个点
1.自己的表达矩阵格式:按照作者的文档和示例数据,输入应该是未取log的表达量,也不要再做其他normalize处理。如果数据已经做过log转换,cibersort会自动还原(脚本中当矩阵最大值小于50时会执行2^Y),所以也不必担心。数据应该是标准化后的表达量,比如FPKM、TPM、CPM,不应该直接用count。
2.在数据量比较大,样本多的时候,请下载R包使用cibersort,因为source函数的读取速度会变得很慢。
3.计算前需要排除空值(NA)值,不然会报错
4.最新的版本是cibersortX,但是好像现在维护有点问题,大家可以自行搜索一下这个方法
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。