WGCNA分析，单细胞组学（TCGA）R语言实践全流程

1.WGCNA基本概念

加权基因共表达网络分析 (WGCNA, Weighted correlation network analysis)是用来描述不同样品之间基因关联模式的系统生物学方法，可以用来鉴定高度协同变化的基因集,并根据基因集的内连性和基因集与表型之间的关联鉴定候补生物标记基因或治疗靶点。相比于只关注差异表达的基因，WGCNA利用数千或近万个变化最大的基因或全部基因的信息识别感兴趣的基因集，并与表型进行显著性关联分析。一是充分利用了信息，二是把数千个基因与表型的关联转换为数个基因集与表型的关联，免去了多重假设检验校正的问题。

基本分析流程如下

构建基因共表达网络：使用加权的表达相关性。
识别基因集：基于加权相关性，进行层级聚类分析，并根据设定标准切分聚类结果，获得不同的基因模块，用聚类树的分枝和不同颜色表示。
如果有表型信息，计算基因模块与表型的相关性，鉴定性状相关的模块。
研究模型之间的关系，从系统层面查看不同模型的互作网络。
从关键模型中选择感兴趣的驱动基因，或根据模型中已知基因的功能推测未知基因的功能。
导出TOM矩阵，绘制相关性图。

参考链接：https://www.jianshu.com/p/e9cc3f43441d

实践：

1.所需数据：GDC,TCGA官网所下载的数据（GDC）

得到WGCNA分析所需的TPM或者FPKM格式，或者经各种方式归一化的数据。

2.建立工作路径，清空环境变量，加载包

setwd("C:/Users/Graduate/Desktop/cr")
rm(list = ls())
#1.基因聚类并绘制热图
if (!requireNamespace("BiocManager", quietly = TRUE))install.packages("BiocManager")
BiocManager::install("impute")
library(WGCNA)
library(reshape2)
library(stringr)

3.预处理数据，counts转TPM格式

#预处理 counts转TPM----
library(rtracklayer)
library(tidyverse)
gtf=rtracklayer::import("C:/Users/Graduate/Desktop/gencode.v43.basic.annotation.gtf.gz")
class(gtf)
gtf = as.data.frame(gtf)
dim(gtf)
table(gtf$type)
#挑选基因的最长转录本
library(tidyverse)
tra = gtf[gtf$type=="transcript",c("start","end","gene_name")]length(unique(tra$gene_name))glt = mutate(tra, length = end - start) %>%arrange(desc(length)) %>% filter(!duplicated(gene_name)) %>% select(c(3,4))head(glt)write.csv(glt,'gene_最长转录本长度.csv')#count值转TPM
library(xlsx)
library(readxl)
exp<- read.csv("Gene Symbol_matrix1未匹配.csv",header = T)#读取基因文件
input<- read.csv("C:/Users/Graduate/Desktop/cr/gene_最长转录本长度.csv",row.names = 1)
#自己在Excel中把网盘里的txt文件基因和长度共两列提取出来
library(dplyr)
merge<-merge(exp,input,by="gene_name")#根据基因那列进行合并
merge <- na.omit(merge)#删除错误值行
write.csv(merge,file = "merge.csv")#读出文件，直接往下运行也行#2.计算TPM
mycounts<-read.csv("merge.csv",check.names = F,row.names = 1)
head(mycounts)
rownames(mycounts)<-mycounts$gene_name
mycounts<-mycounts[,-1]
head(mycounts)#最后一列Length是基因长度#TPM计算
kb <- mycounts$length / 1000
kb
countdata <- mycounts[,1:614]
rpk <- countdata / kb
rpk
tpm <- t(t(rpk)/colSums(rpk) * 1000000)
head(tpm)
write.csv(tpm,file="82_KIRC_tpm.csv")
#TPM转FPKM----
fpkm <- t(t(rpk)/colSums(tpm) * 10^6)
nrow(fpkm)
nrow(tpm)
#标准化转换后基因数不变
write.csv(fpkm,file = "./差异基因fpkm.csv")

4.读取表达矩阵

#1.1读取express，去除表达量为0的样本
expres=read.csv("差异基因fpkm.csv",header = T)
#expres=read.table('Annotated_TCGA_matrix.txt',sep="\t",row.names=1,header=T,check.names=F,quote="!")expres <- aggregate( . ~ symbol_id,data=expres, max)
row.names(expres) <- expres[,1]
expres <- expres[,-1]
expres=expres[which(rowSums(expres)>0),]
expres=expres[order(rowSums(expres),decreasing = T),]##1.2剔除正常样本，留下肿瘤样本
expres[]=lapply(expres, as.numeric)
tumor <- colnames(expres)[as.integer(substr(colnames(expres),14,15)) < 10] #样本名的14、15位为1-9的是肿瘤样本,>=10为正常或癌旁样本
tumor_sample <- expres[,tumor]
expres <- tumor_sample
datExpr=t(expres)
datExpr
#####1.3样本聚类
sampleTree = hclust(dist(datExpr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
####
powers = c(c(1:10), seq(from = 12, to=20, by=2))sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)##sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;

5.软阈值选择

######Powerz值选择beta1
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red");#######
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
power_select=sft[["powerEstimate"]]
power=power_select
power_select
######module detection
##一步法网络构建：One-step network construction and module detection##
# power: 上一步计算的软阈值
# maxBlockSize: 计算机能处理的最大模块的基因数量 (默认5000)；
#  4G内存电脑可处理8000-10000个，16G内存电脑可以处理2万个，32G内存电脑可
#  以处理3万个
#  计算资源允许的情况下最好放在一个block里面。
# corType: pearson or bicor
# numericLabels: 返回数字而不是颜色作为模块的名字，后面可以再转换为颜色
# saveTOMs：最耗费时间的计算，存储起来，供后续使用
# mergeCutHeight: 合并模块的阈值，越大模块越少

6.设置参数

####参数# 为与原文档一致，故未修改
type = "unsigned"
# 相关性计算
# 官方推荐 biweight mid-correlation & bicor
# corType: pearson or bicor
# 为与原文档一致，故未修改
corType = "pearson"
cor <- WGCNA::cor
corFnc = ifelse(corType=="pearson",cor,bicor)
# 对二元变量，如样本性状信息计算相关性时，
# 或基因表达严重依赖于疾病状态时，需设置下面参数
maxPOutliers = ifelse(corType=="pearson",1,0.05)
# 关联样品性状的二元变量时，设置
robustY = ifelse(corType=="pearson",T,F)
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

7.网络的构建，以无向网络为例

cor <- WGCNA::cor
net = blockwiseModules(datExpr, power = power, maxBlockSize = nGenes,TOMType = type, minModuleSize = 30,reassignThreshold = 0, mergeCutHeight = 0.25,numericLabels = TRUE, pamRespectsDendro = FALSE,saveTOMs=TRUE, corType = corType, maxPOutliers=maxPOutliers, loadTOMs=TRUE,saveTOMFileBase = paste0("AS-green-FPKM-TOM"),verbose = 3)
save(net,datExpr,file = './WGCNA.Rdata')
cor<-stats::cor
table(net$colors)
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
# Plot the dendrogram and the module colors underneath
# 如果对结果不满意，还可以recutBlockwiseTrees，节省计算时间
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],"Module colors",dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
######
ADJ= adjacency(datExpr,power = power_select)
vis=exportNetworkToCytoscape(ADJ,edgeFile="edge1.txt",nodeFile="node.txt",threshold = 0.4)
##############箱线图
# module eigengene, 可以绘制线图，作为每个模块的基因表达趋势的展示
MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
# 根据基因间表达量进行聚类所得到的各模块间的相关性图
# marDendro/marHeatmap 设置下、左、上、右的边距
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap", marDendro = c(3,3,2,4),marHeatmap = c(3,4,2,2), plotDendrograms = T, xLabelsAngle = 90)
save(plotEigengeneNetworks,file = './marHeatmapWGCNA.Rdata')########TOM PLOT
load(net$TOMFiles[1], verbose=T)## Loading objects:
##   TOM
TOM <- as.matrix(TOM)dissTOM = 1-TOM
# Transform dissTOM with a power to make moderately strong
# connections more visible in the heatmap
plotTOM = dissTOM^7
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA
# Call the plot function# 这一部分特别耗时，行列同时做层级聚类
TOMplot(plotTOM, net$dendrograms, moduleColors, main = "Network heatmap plot, all genes")
save(TOMplot,file = '.plotTOMWGCNA.Rdata')
######网络图
probes = colnames(datExpr)
dimnames(TOM) <- list(probes, probes)# Export the network into edge and node list files Cytoscape can read
# threshold 默认为0.5, 可以根据自己的需要调整，也可以都导出后在
# cytoscape中再调整
cyt = exportNetworkToCytoscape(TOM,threshold = 0,nodeNames = probes, nodeAttr = moduleColors)
cyt$edgeData
cyt$nodeData

8.基因与患者表型，性状关联

#2.患者表型、性状与基因模块相关联
clinical=read.csv("clinical.csv",header = T)
#2.1读取clinical
#clinical=read.table("clinical.txt",sep = "\t",header = T)#clinical$days_to_death[is.na(clinical$days_to_death)]=0
#clinical$days_to_last_followup[is.na(clinical$days_to_last_followup)]=0
#clinical$surv_time=clinical$days_to_last_followup+clinical$days_to_deathclinical=clinical[-which(clinical$surv_time==0),]
####去重
clinical=clinical[!duplicated(clinical$TCGA_id),]
k=rownames(datExpr)
k=substr(k,1,12)
row.names(datExpr)=k
datExpr=datExpr[!duplicated(row.names(datExpr)),]
row=match(row.names(datExpr),clinical$TCGA_id)
dcli=clinical[row,]
dcli=dcli[!duplicated(dcli$TCGA_id),]
dcli = na.omit(dcli)
rownames(dcli)=dcli$TCGA_id
dcli=dcli[,-1]
dclic=subset(dcli,select = c('age_at_index','gender','surv_time','EMT_group','ajcc_pathologic_stage'))
##############样本临床数据聚类
library(flashClust)
A = adjacency(t(datExpr), type = "distance")
# this calculates the whole network connectivity
k = as.numeric(apply(A, 2, sum)) - 1
# standardized connectivity
Z.k = scale(k)thresholdZ.k = -5  # often -2.5
sampleTree =  flashClust(as.dist(1 - A), method = "average")
# Convert traits to a color representation: where red indicates high
# values
traitColors = data.frame(numbers2colors(dclic, signed = FALSE))
dimnames(traitColors)[[2]] = names(dclic)
outlierColor = ifelse(Z.k < thresholdZ.k, "red", "blue")
datColors = data.frame(outlier = outlierColor, traitColors)plotDendroAndColors(sampleTree, groupLabels = names(datColors), colors = datColors, main = "Sample dendrogram and trait heatmap",cex.colorLabels = 1,cex.rowText = 0.4,cex.dendroLabels = 0.1)

9.关联表型数据，绘制热图

####2.2关联表型数据
moduleLabelsAutomatic = net$colors
# Convert labels to colors for plotting
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
moduleColorsFemale = moduleColorsAutomatic
# Define numbers of genes and samples
t_rt=datExpr
nGenes = ncol(t_rt)
nSamples = nrow(t_rt)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(t_rt, moduleColorsFemale)$eigengenes
MEsFemale = orderMEs(MEs0)modTraitCor = cor(MEsFemale, dclic, use = "p")
modTraitP = corPvalueStudent(modTraitCor, nSamples)
# Since we have a moderately large number of modules and traits, a
# suitable graphical representation will help in reading the table. We
# color code each association by the correlation value: Will display
# correlations and their p-values
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
par(mar = c(4, 8, 1, 2))# Display the correlation values within a heatmap plot
#tiff('c.tiff',width = 4000,height = 2700,res = 300)
labeledHeatmap(Matrix = modTraitCor, xLabels = c('age_at_index','gender','surv_time','EMT_group','ajcc_pathologic_stage'),yLabels = names(MEsFemale), ySymbols = names(MEsFemale), colorLabels = FALSE, colors = blueWhiteRed(50), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.6, zlim = c(-1, 1), main = paste("Module-trait relationships"),font.lab.x = 0.5,font.lab.y = 0.5)save(labeledHeatmap,file = './labeledHeatmapWGCNA.Rdata')

10.基因与性状的关联矩阵

#####基因模块结果
blocknumber=1
datColors = data.frame(moduleColorsAutomatic)[net$blockGenes[[blocknumber]], ]
datKME = signedKME(t_rt, MEsFemale)
datSummary=rownames(expres)
datout=data.frame(datSummary,datColors,datKME )
datout######## 计算性状与基因的相关性矩阵## 只有连续型性状才能进行计算，如果是离散变量，在构建样品表时就转为0-1矩阵。if (corType=="pearson") {geneModuleMembership = as.data.frame(cor(datExpr, MEs_col, use = "p"))MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
} else {geneModuleMembershipA = bicorAndPvalue(datExpr, MEs_col, robustY=robustY)geneModuleMembership = geneModuleMembershipA$bicorMMPvalue   = geneModuleMembershipA$p
}
if (corType=="pearson") {geneTraitCor = as.data.frame(cor(datExpr, dclic, use = "p"))geneTraitP = as.data.frame(corPvalueStudent(as.matrix(geneTraitCor), nSamples))
} else {geneTraitCorA = bicorAndPvalue(datExpr, dclic, robustY=robustY)geneTraitCor = as.data.frame(geneTraitCorA$bicor)geneTraitP   = as.data.frame(geneTraitCorA$p)
}#######
table(net$colors)
module = "turquoise"
pheno = "EMT_group"
modNames = substring(colnames(MEs_col), 3)
# 获取关注的列
module_column = match(module, modNames)
pheno_column = match(pheno,colnames(dclic))
# 获取模块内的基因
moduleGenes = moduleColors == module
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
# 与性状高度相关的基因，也是与性状相关的模型的关键基因
verboseScatterplot(abs(geneModuleMembership[moduleGenes, module_column]),abs(geneTraitCor[moduleGenes, pheno_column]),xlab = paste("Module Membership in", module, "module"),ylab = paste("Gene significance for", pheno),main = paste("Module membership vs. gene significance\n"),cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

WGCNA分析，单细胞组学（TCGA）R语言实践全流程相关推荐

林学菜鸟---R语言点格局分析
林学菜鸟-R语言点格局分析本人是R语言菜鸟一枚,有不足的地方希望大佬们指教.这段时间忙着写毕业论文,因为论文的一部分内容涉及到树种的分布格局,所以自己鼓捣了一段时间.下面是我用R语言(Rstudio ...
Nature Metabolism I 衰老的单细胞组学研究进展及展望
单细胞组学和衰老研究的二三事 NGS系列文章包括NGS基础.转录组分析 (Nature重磅综述|关于RNA-seq你想知道的全在这).ChIP-seq分析 (ChIP-seq基本分析流程).单细胞测序 ...
《数据科学R语言实践：面向计算推理与问题求解的案例研究法》一一2.3　数据清洗和变量格式化...
本节书摘来自华章计算机<数据科学R语言实践:面向计算推理与问题求解的案例研究法>一书中的第2章,第2.3节,作者:[美] 德博拉·诺兰(Deborah Nolan) 邓肯·坦普·朗(Dun ...
《数据科学R语言实践：面向计算推理与问题求解的案例研究法》一一2.4　探索所有男选手的跑步时间...
本节书摘来自华章计算机<数据科学R语言实践:面向计算推理与问题求解的案例研究法>一书中的第2章,第2.4节,作者:[美] 德博拉·诺兰(Deborah Nolan) 邓肯·坦普·朗(Dun ...
《数据科学R语言实践：面向计算推理与问题求解的案例研究法》一一2.6　对个人跑步时间的变化进行建模...
本节书摘来自华章计算机<数据科学R语言实践:面向计算推理与问题求解的案例研究法>一书中的第2章,第2.6节,作者:[美] 德博拉·诺兰(Deborah Nolan) 邓肯·坦普·朗(Dun ...
《数据科学R语言实践：面向计算推理与问题求解的案例研究法》一一2.1　引言...
本节书摘来自华章计算机<数据科学R语言实践:面向计算推理与问题求解的案例研究法>一书中的第2章,第2.1节,作者:[美] 德博拉·诺兰(Deborah Nolan) 邓肯·坦普·朗(Dun ...
R语言︱H2o深度学习的一些R语言实践——H2o包
每每以为攀得众山小,可.每每又切实来到起点,大牛们,缓缓脚步来俺笔记葩分享一下吧,please~ --------------------------- R语言H2o包的几个应用案例笔者寄语:受启发 ...
泊松回归与类泊松回归（《R语言实践（第二版）》）
泊松回归与类泊松回归 0. 背景介绍 0.1 泊松回归: 0.2 本示例要做的事: 1.导包 2.导入数据 3.作图 4. 拟合泊松回归: 4.1解释模型参数 4.2 检验过度离势(Overdispe ...
《数据科学R语言实践：面向计算推理与问题求解的案例研究法》一一 1.1　引言...
本节书摘来自华章计算机<数据科学R语言实践:面向计算推理与问题求解的案例研究法>一书中的第1章,第1.1节,作者:[美] 德博拉·诺兰(Deborah Nolan) 邓肯·坦普·朗(Dun ...

WGCNA分析，单细胞组学（TCGA）R语言实践全流程

1.WGCNA基本概念

WGCNA分析，单细胞组学（TCGA）R语言实践全流程相关推荐

最新文章

热门文章