Cibersort 算法分析肿瘤样本免疫细胞组分

2015年斯坦福大学医学院的一个研究团队，提出了一种分析单细胞类型的新方法。这种方法类似于分析一杯奶昔，以找到什么水果加入其中。本研究描述的方法称为Cibersort，在线发表于三月三十日的《自然方法》（Nature Methods）

Newman, Aaron M et al. “Robust enumeration of cell subsets from tissue expression profiles.” Nature methods vol. 12,5 (2015): 453-7. doi:10.1038/nmeth.3337

在Pubmed简单搜索，就有 1300+ 的记录，可见该方法很值得学习

CIBERSORTx 新版官网入口：CIBERSORTx 官网https://cibersortx.stanford.edu/

增加了适配大量组织(bulk tissue)样本单细胞测序方面的功能。

CIBERSORTx is an analytical tool from the Alizadeh Lab and Newman Lab to impute gene expression profiles and provide an estimation of the abundances of member cell types in a mixed cell population, using gene expression data. （输入基因表达矩阵，输出样本中的各种细胞类型的丰度。比较有意思的是，CIBERSORT的设计之初的核心目标是“predicting fractions of multiple cell types in gene expression profiles (GEPs)”，可能例子里22个免疫细胞 signature让人太过印象深刻，几乎全互联网的教程都是基于免疫细胞的，CIBERSORT也被认为是专门用来量化免疫浸润的，其实signature是可以按需求自行选择的。）

source.R文件：

#' CIBERSORT R script v1.03 (last updated 07-10-2015)
#' Note: Signature matrix construction is not currently available; use java version for full functionality.
#' Author: Aaron M. Newman, Stanford University (amnewman@stanford.edu)
#' Requirements:
#'       R v3.0 or later. (dependencies below might not work properly with earlier versions)
#'       install.packages('e1071')
#'       install.pacakges('parallel')
#'       install.packages('preprocessCore')
#'       if preprocessCore is not available in the repositories you have selected, run the following:
#'           source("http://bioconductor.org/biocLite.R")
#'           biocLite("preprocessCore")
#' Windows users using the R GUI may need to Run as Administrator to install or update packages.
#' This script uses 3 parallel processes.  Since Windows does not support forking, this script will run
#' single-threaded in Windows.
#'
#' Usage:
#'       Navigate to directory containing R script
#'
#'   In R:
#'       source('CIBERSORT.R')
#'       results <- CIBERSORT('sig_matrix_file.txt','mixture_file.txt', perm, QN)
#'
#'       Options:
#'       i)  perm = No. permutations; set to >=100 to calculate p-values (default = 0)
#'       ii) QN = Quantile normalization of input mixture (default = TRUE)
#'
#' Input: signature matrix and mixture file, formatted as specified at http://cibersort.stanford.edu/tutorial.php
#' Output: matrix object containing all results and tabular data written to disk 'CIBERSORT-Results.txt'
#' License: http://cibersort.stanford.edu/CIBERSORT_License.txt
#' Core algorithm
#' @param X cell-specific gene expression
#' @param y mixed expression per sample
#' @export
CoreAlg <- function(X, y){#try different values of nusvn_itor <- 3res <- function(i){if(i==1){nus <- 0.25}if(i==2){nus <- 0.5}if(i==3){nus <- 0.75}model<-e1071::svm(X,y,type="nu-regression",kernel="linear",nu=nus,scale=F)model}if(Sys.info()['sysname'] == 'Windows') out <- parallel::mclapply(1:svn_itor, res, mc.cores=1) elseout <- parallel::mclapply(1:svn_itor, res, mc.cores=svn_itor)nusvm <- rep(0,svn_itor)corrv <- rep(0,svn_itor)#do cibersortt <- 1while(t <= svn_itor) {weights = t(out[[t]]$coefs) %*% out[[t]]$SVweights[which(weights<0)]<-0w<-weights/sum(weights)u <- sweep(X,MARGIN=2,w,'*')k <- apply(u, 1, sum)nusvm[t] <- sqrt((mean((k - y)^2)))corrv[t] <- cor(k, y)t <- t + 1}#pick best modelrmses <- nusvmmn <- which.min(rmses)model <- out[[mn]]#get and normalize coefficientsq <- t(model$coefs) %*% model$SVq[which(q<0)]<-0w <- (q/sum(q))mix_rmse <- rmses[mn]mix_r <- corrv[mn]newList <- list("w" = w, "mix_rmse" = mix_rmse, "mix_r" = mix_r)}#' do permutations
#' @param perm Number of permutations
#' @param X cell-specific gene expression
#' @param y mixed expression per sample
#' @export
doPerm <- function(perm, X, Y){itor <- 1Ylist <- as.list(data.matrix(Y))dist <- matrix()while(itor <= perm){#print(itor)#random mixtureyr <- as.numeric(Ylist[sample(length(Ylist),dim(X)[1])])#standardize mixtureyr <- (yr - mean(yr)) / sd(yr)#run CIBERSORT core algorithmresult <- CoreAlg(X, yr)mix_r <- result$mix_r#store correlationif(itor == 1) {dist <- mix_r}else {dist <- rbind(dist, mix_r)}itor <- itor + 1}newList <- list("dist" = dist)
}#' Main functions
#' @param sig_matrix file path to gene expression from isolated cells
#' @param mixture_file heterogenous mixed expression
#' @param perm Number of permutations
#' @param QN Perform quantile normalization or not (TRUE/FALSE)
#' @export
CIBERSORT <- function(sig_matrix, mixture_file, perm=0, QN=TRUE){#read in dataX <- read.table(sig_matrix,header=T,sep="\t",row.names=1,check.names=F)Y <- read.table(mixture_file, header=T, sep="\t", check.names=F)Y <- Y[!duplicated(Y[,1]),]rownames(Y)<-Y[,1]Y<-Y[,-1]X <- data.matrix(X)Y <- data.matrix(Y)#orderX <- X[order(rownames(X)),]Y <- Y[order(rownames(Y)),]P <- perm #number of permutations#anti-log if max < 50 in mixture fileif(max(Y) < 50) {Y <- 2^Y}#quantile normalization of mixture fileif(QN == TRUE){tmpc <- colnames(Y)tmpr <- rownames(Y)Y <- preprocessCore::normalize.quantiles(Y)colnames(Y) <- tmpcrownames(Y) <- tmpr}#intersect genesXgns <- row.names(X)Ygns <- row.names(Y)YintX <- Ygns %in% XgnsY <- Y[YintX,]XintY <- Xgns %in% row.names(Y)X <- X[XintY,]#standardize sig matrixX <- (X - mean(X)) / sd(as.vector(X))# 矩阵X是LM22矩阵# 矩阵Y是待预测的矩阵# 然后对#empirical null distribution of correlation coefficientsif(P > 0) {nulldist <- sort(doPerm(P, X, Y)$dist)}#print(nulldist)header <- c('Mixture',colnames(X),"P-value","Correlation","RMSE")#print(header)output <- matrix()itor <- 1mixtures <- dim(Y)[2]pval <- 9999#iterate through mixtureswhile(itor <= mixtures){y <- Y[,itor]#standardize mixturey <- (y - mean(y)) / sd(y)#run SVR core algorithmresult <- CoreAlg(X, y)#get resultsw <- result$wmix_r <- result$mix_rmix_rmse <- result$mix_rmse#calculate p-valueif(P > 0) {pval <- 1 - (which.min(abs(nulldist - mix_r)) / length(nulldist))}#print outputout <- c(colnames(Y)[itor],w,pval,mix_r,mix_rmse)if(itor == 1) {output <- out}else {output <- rbind(output, out)}itor <- itor + 1}#save resultswrite.table(rbind(header,output), file="CIBERSORT-Results.txt", sep="\t", row.names=F, col.names=F, quote=F)#return matrix object containing all resultsobj <- rbind(header,output)obj <- obj[,-1]obj <- obj[-1,]obj <- matrix(as.numeric(unlist(obj)),nrow=nrow(obj))rownames(obj) <- colnames(Y)colnames(obj) <- c(colnames(X),"P-value","Correlation","RMSE")obj
}

LM22.txt文件：

展示部分：完整文件可从Nature Methods文章附录文件下载，调整成如下格式即可

Gene symbol	B cells naive	B cells memory	Plasma cells	T cells CD8	T cells CD4 naive	T cells CD4 memory resting	T cells CD4 memory activated	T cells follicular helper	T cells regulatory (Tregs)	T cells gamma delta	NK cells resting
ABCB4	555.7134	10.74424	7.225819	4.31128	4.60586	7.406442	8.043976	6.469993	7.833082	9.312295	13.68284
ABCB9	15.60354	22.09479	653.3923	24.22372	35.67151	30.04819	38.45542	17.60479	46.07366	19.71572	21.86372
ACAP1	215.306	321.621	38.61687	1055.613	1790.097	922.1527	340.8834	1107.798	1995.483	280.0757	512.6369
ACHE	15.11795	16.64885	22.12374	13.42829	27.18773	18.44493	13.44127	14.80554	24.65271	33.65845	16.65374
ACP5	605.8974	1935.201	1120.105	306.3125	744.6566	557.8198	248.5469	711.9497	958.916	493.9691	340.8201
ADAM28	1943.743	1148.12	324.7808	22.68972	40.06171	21.23331	15.20962	24.28141	74.76634	24.37075	193.6094
ADAMDEC1	371.0336	318.4788	127.9674	44.61629	80.9645	47.43351	30.55298	68.73789	156.7077	231.8273	68.27759
ADAMTS3	146.1956	106.0523	74.33917	42.39042	79.2418	57.38591	53.41302	35.41108	76.28062	65.26799	31.43527
ADRB2	486.3438	510.0813	289.7985	899.6485	77.15395	595.2105	142.8319	97.17423	217.0628	2378.452	1957.539
AIF1	24.0743	20.32186	21.96957	742.8158	718.6148	35.17225	39.50876	19.07768	29.36704	1086.96	105.7346
AIM2	327.9581	4538.991	1429.674	294.4846	57.0808	250.2996	3311.978	1115.682	315.6459	407.5929	87.22742
ALOX15	12.23359	13.75324	9.073509	9.774972	18.55656	12.00966	5.239541	13.15225	22.38142	14.45276	30.17685
ALOX5	1799.684	2425.681	190.3315	78.68444	74.9309	132.0101	59.08736	35.00087	143.0134	441.5556	157.0365
AMPD1	263.7451	217.7617	3012.768	66.10749	68.66195	85.31837	68.46141	53.95514	79.26421	114.0495	59.14044
ANGPT4	23.12217	21.31256	230.6807	44.68466	22.06015	44.47513	99.594	101.4456	29.46508	29.25858	9.380756
ANKRD55	284.4127	218.9158	117.2004	141.6981	1853.336	436.4203	79.77497	1077.33	123.8806	176.2439	83.38528
APOBEC3A	102.4292	89.60069	99.594	64.35534	107.2355	52.73721	57.78232	71.37789	36.39007	723.1438	253.3068

第一步：准备R包与CIBERSORT算法

rm(list=ls())   #清空环境变量
options(stringsAsFactors = F)install.packages('e1071')
install.packages('parallel')
BiocManager::install("preprocessCore")source("source.R")

第二步：准备表达矩阵文件，无需进行Log转换，cibersort算法会完成转换


load("CESC_FPKM_tumor.Rda")
ciber_input <- log2(CESC_FPKM_tumor_final + 1)
write.table(ciber_input, file = "cibersort_input.txt",sep = "\t", row.names = T,col.names = NA,quote = F)

第三步： CIBERSORT计算


#CIBERSORT计算
sig_matrix <- "LM22.txt"   #注释文件名
mixture_file = 'cibersort_input.txt'   #表达数据文件名
res_cibersort <- CIBERSORT(sig_matrix, mixture_file, perm=100, QN=TRUE)#用cibersort算法计算
save(res_cibersort,file = "res_cibersort.Rdata")   #保存结果

第四步：可视化展示


#可视化展示
rm(list=ls())
load("res_cibersort.Rdata")
res_cibersort <- res_cibersort[,1:22]   #取前22列为细胞丰度数据
ciber.res <- res_cibersort[,colSums(res_cibersort) > 0]   #去除丰度全为0的细胞#barplot图
mycol <- ggplot2::alpha(rainbow(ncol(ciber.res)), 0.7) #创建彩虹色板（带70%透明度）
par(bty="o", mgp = c(2.5,0.3,0), mar = c(2.1,4.1,2.1,10.1),tcl=-.25,las = 1,xpd = F)
a = barplot(as.matrix(t(ciber.res)),border = NA, # 柱子无边框names.arg = rep("",nrow(ciber.res)), # 无横坐标样本名yaxt = "n", # 先不绘制y轴ylab = "Relative percentage", # 修改y轴名称col = mycol) # 采用彩虹色板
axis(side = 2, at = c(0,0.2,0.4,0.6,0.8,1), # 补齐y轴添加百分号labels = c("0%","20%","40%","60%","80%","100%"))
legend(par("usr")[2]-20, # 这里-20要根据实际出图的图例位置情况调整par("usr")[4], legend = colnames(ciber.res), xpd = T,fill = mycol,cex = 0.8, border = NA, y.intersp = 1,x.intersp = 0.2,bty = "n")
dev.off()   #关闭画板#相关性热图
#install.packages('corrplot')
M <- round(cor(ciber.res),2) # 计算相关性矩阵并保留两位小数library(corrplot)
corrplot.mixed(M,lower.col = "black", #左下方字体颜色为黑色tl.pos = "lt",  #标签出现在左侧和顶部number.cex = 0.5, #左下方字号为0.5tl.cex = 0.5) #标签字号为0.5
dev.off()   #关闭画板

#箱线图
dd1 <- ciber.res %>%as.data.frame() %>%rownames_to_column("sample") %>%pivot_longer(cols = 2:23,names_to = "CellType",values_to = "Composition")plot.info <- dd1[]          ggboxplot(plot.info,x = "CellType",y = "Composition",color = "black",fill = "CellType",xlab = "",ylab = "Cell composition",main = "TME Cell composition") +theme_base() +theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 1))

参考：

作者：又是一只小菜鸟
链接：https://www.jianshu.com/p/add97b7640a4
来源：简书

作者：菠萝西斯
链接：https://blog.csdn.net/u013429737/article/details/116044812
来源：CSDN