差异表达基因提取limma+WGCNA分析全代码
用limma包和WGCNA包进行RNA-seq数据分析
#数据提取#
GE<-read.table('TCGA-COAD.htseq_counts.tsv',header=T,sep='\t',stringsAsFactors = F)
#60488*513 512个样本,其中对照组41个
# group_data<-data.frame(colnames(GE)[-1])
group<-rep('tumor',512)
group[grep("11A", colnames(GE)[-1], ignore.case = FALSE, perl = FALSE)]<-"normal"
# group_data<-cbind(group_data,group)
# colnames(group_data)<-c("sample","group")
#提取差异表达基因
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("limma")
library("limma")
#制作分组矩阵
design <- model.matrix(~0+factor(group))
colnames(design)=levels(factor(group))
rownames(design)=colnames(GE)[-1]
head(design)
exprSet<-GE[,-1]
rownames(exprSet)<-GE[,1]
# limma包中voom()函数用于标准化RNA-Seq或芯片数据。该函数将计数(所有计数加0.5以避免对数取零)转换为
#ogCPM值矩阵进行标准化。通过估计对数计数的均值-方差趋势,然后根据其预测方差为每个观察值分配权重,
#用于在线性建模过程中使用权重来调整异方差。
# 数据标准化后,根据每个基因的平均log2CPM构建线性模型,计算残差,并拟合log2CPM与残差标准差平方根的关系,得到的趋势线(红色平滑曲线)可为样本和基因分配权重。
# 对于该趋势线,若左侧0起点处所示残差标准差明显偏高,或者0起点处出现上式趋势,
# 则表明数据中存在较多的低表达(low counts数)基因。若觉得它们可能会对后续的计算带来较大的误差,不妨在标准化前手动过滤下。
norm <- voom(exprSet, design, plot = TRUE)
tag<-apply(exprSet,1,function(x) {if (length(which(x==0))>512*0.8) {re=0#去除在少于50%的样本中表达的基因}else {re=1}
})
table(tag)
exprSet<-exprSet[which(tag==1),]#36169*512 基因数降低到不到三万
norm <- voom(exprSet, design, plot = TRUE)
#标准化前后数据比较
par(mfrow = c(2, 2))
boxplot(exprSet)
plotDensities(exprSet)
boxplot(norm$E)
plotDensities(norm$E)
#线性拟合,详见 ?lmFit
fit <- lmFit(norm, design, method = 'ls')
#确定比较的两组
#后续将计算标记为 1 的组相对于 -1 的组,基因表达值的上调/下调状态
contrast <- makeContrasts('treat-control', levels = design)
contrast
#使用经验贝叶斯模型拟合标准误差,详见 ?eBayes
fit2 <- contrasts.fit(fit, contrast)
fit2 <- eBayes(fit2)
qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
abline(0,1)
#p 值校正、提取差异分析结果,详见 ?topTable
diff_gene <- topTable(fit2, number = Inf, adjust.method = 'fdr')
head(diff_gene, 10)
write.table(diff_gene, 'gene_diff.txt', col.names = NA, sep = '\t', quote = FALSE)
#这里根据 |log2FC| >= 1 & FDR p-value < 0.01 定义“差异”
diff_gene[which(diff_gene$logFC >= 1 & diff_gene$adj.P.Val < 0.01),'sig'] <- 'red'
diff_gene[which(diff_gene$logFC <= -1 & diff_gene$adj.P.Val < 0.01),'sig'] <- 'blue'
diff_gene[which(abs(diff_gene$logFC) < 1 | diff_gene$adj.P.Val >= 0.01),'sig'] <- 'gray'
log2FoldChange <- diff_gene$logFC
FDR <- diff_gene$adj.P.Val
plot(log2FoldChange, -log10(FDR), pch = 20, col = diff_gene$sig)
abline(v = 1, lty = 2)
abline(v = -1, lty = 2)
abline(h = -log(0.01, 10), lty = 2)
upGenes<-diff_gene[which(diff_gene$logFC >= 1 & diff_gene$adj.P.Val < 0.01),]#549
downGenes<-diff_gene[which(diff_gene$logFC <= -1 & diff_gene$adj.P.Val < 0.01),]#1563
sig_genes<-rbind(upGenes,downGenes)#2112
dim(upGenes)
dim(downGenes)
dim(sig_genes)#读入注释文件
gencode<-read.table('gencode.v22.annotation.gene.probeMap',header = T,sep = '\t',stringsAsFactors = F)
FPKM<-read.table('TCGA-COAD.htseq_fpkm.tsv',header=T,sep='\t',stringsAsFactors=F)
sig_genes<-data.frame(rownames(sig_genes))
colnames(sig_genes)<-'Ensembl_ID'
#先提取出感兴趣基因的表达
sigE<-merge(FPKM,sig_genes,by.x = 'Ensembl_ID',by.y = 'Ensembl_ID')
#转换基因ID#
sigSymbolE<-merge(gencode[,1:2],sigE,by.x='id',by.y='Ensembl_ID')
sigSymbolE<-sigSymbolE[,-1]
#处理多个探针对应一个基因的情况#
probe2gene = aggregate(sigSymbolE[,-1], by=list(sigSymbolE[,"gene"]),mean)#2107
write.table(probe2gene,'DEG.txt',sep='\t',row.names = F,quote = F)#the genes that we are interested has been exacted sucessfully.# #deal with the phenotype and survival data
# survival<-read.table('TCGA-OV.survival.tsv',header=T,stringsAsFactors=F)
# #phenotype<-read.table('TCGA-OV.GDC_phenotype.tsv',header=T,stringsAsFactors=F)#有问题,先放这儿
# samples<-data.frame(colnames(probe2gene)[-1])
# samples<-data.frame(apply(samples,1,function(x){re=gsub('.','-',x,fixed=T)}))
# colnames(samples)<-'sample ID'
# survival1<-merge(samples,survival,by.x='sample ID',by.y='sample',all.x=T)#只有378个患者有相应的生存数据,因此剔除表达谱中该患者的数据
# survival1<-survival1[-9,]# index<-which(samples=='TCGA-04-1357-01A')
# probe2gene<-probe2gene[,-(index+1)]
# #更新输出
# write.table(probe2gene,'sig_genes_exp.txt',sep='\t',row.names = F,quote = F)#the genes that we are interested has been exacted sucessfully.
save.image('.Rdata')#WGCNA分析
###WGCNA###########
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("WGCNA")
#导入的时候可能存在部分依赖包没有导进来的情况,把这些包单独导进来就好
#如果因为网络原因一直无法成功下载,试试本地安装
############################## #首先构造表型相关矩阵
# survival<-read.table('TCGA-OV.survival.tsv',header=T,sep='\t')
# dim(survival)
# survival<-survival[,c(1,4)]
# survival[,1]<-as.data.frame(gsub('-','.',survival[,1]))
# samples_name<-as.data.frame(row.names(datExpr))
# colnames(samples_name)<-'samples'
# survival_OS<-merge(survival,samples_name,by.y='samples',by.x='sample',all.y=T)
# #缺少TCGA.04.1357.01A的临床数据,因此把它删掉###start##library(WGCNA)
library(stringr)
library(flashClust)
library(iterators)femData <- read.table("DEG.txt",header=TRUE,stringsAsFactors=F) #加载数据
datExpr0 = as.data.frame(t(log2(femData[,-1]+1)))#这里我用的时FPKM的数据,根据WGCNA的官方说明,需要对其进行log2(X+1)转换
colnames(datExpr0) = femData[,1] #列标签添加基因名称
rownames(datExpr0) = names(femData[,-1]) #行标签添加为样本名称
# datExpr<-datExpr[-which(row.names(datExpr)=='TCGA.04.1357.01A'),]
gsg = goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK #结果为TRUE,则所有选定基因都用于后续WGCNA
#datExpr=datExpr[gsg$goodGenes] #如果gsg$allOK结果为FALSE,则后续选择好的基因用于WGCNA。sampleTree = hclust(dist(datExpr0), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
sizeGrWindow(12,9)
# pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
# Plot a line to show the cut
abline(h = 32, col = "red");
# Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 32, minSize = 10)
table(clust)
# clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#The variable datExpr now contains the expression data ready for network analysis.# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")net = blockwiseModules(datExpr, power = 6,TOMType = "unsigned", minModuleSize = 30,reassignThreshold = 0, mergeCutHeight = 0.25,numericLabels = TRUE, pamRespectsDendro = FALSE,saveTOMs = TRUE,saveTOMFileBase = "coadTOM", verbose = 3)table(net$colors)
# open a graphics window
sizeGrWindow(12, 9)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],"Module colors",dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];#加入表型信息-肿瘤还是正常,进行分析
sample<-rownames(datExpr)
tumor<-rep(1,length(sample))
normal<-rep(0,length(sample))
tumor[grep("11A", sample)]<-0
normal[grep("11A", sample)]<-1
table(tumor)
table(normal)
datTraits<-data.frame(tumor,normal)
rownames(datTraits)<-sample# Define numbers of genes and samples
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,xLabels = names(datTraits),yLabels = names(MEs),ySymbols = names(MEs),colorLabels = FALSE,colors = greenWhiteRed(50),textMatrix = textMatrix,setStdMargins = FALSE,cex.text = 0.5,zlim = c(-1,1),main = paste("Module-trait relationships"))# Define variable weight containing the weight column of datTrait
# weight = as.data.frame(datTraits$tumor);
# names(weight) = "weight"
# # names (colors) of the modules
# modNames = substring(names(MEs), 3)# geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
# MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));# names(geneModuleMembership) = paste("MM", modNames, sep="");
# names(MMPvalue) = paste("p.MM", modNames, sep="");# geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p"));
# GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));# names(geneTraitSignificance) = paste("GS.", names(weight), sep="");
# names(GSPvalue) = paste("p.GS.", names(weight), sep="");# Calculate topological overlap anew: this could be done more efficiently by saving the TOM
# calculated during module detection, but let us do it again here.
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);
# Transform dissTOM with a power to make moderately strong connections more visible in the heatmap
plotTOM = dissTOM^7;
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA;
# Call the plot function
sizeGrWindow(9,9)
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
关于WGCNA包中还有更丰富的功能,请参见Bioconductor中的官方说明。
差异表达基因提取limma+WGCNA分析全代码相关推荐
- WGCNA分析 | 全流程代码分享 | 代码二
– 关于WGNCA的教程,本次的共有三期教程,我们同时做了三个分析的比较,差异性相对还是比较大的,详情可看WGCNA分析 | 你的数据结果真的是准确的吗??,这里面我们只是做了输出图形的比较差异,具体 ...
- 用R语言做WGCNA分析全步骤一(附代码解读)【转载】
代码逐句分析 一.文章来源 二.基因共表达网络构建及模块识别 1.数据导入.清洗及预处理 2.检查过度缺失值和离群样本 3.聚类做离群样本检测 4.载入临床特征数据 三.自动构建网络及识别模块 1.确 ...
- RNA 2. SCI文章中基于GEO的差异表达基因之 limma
01. 前言 目前随着测序成本的降低,越来越多的文章中都将使用基因的表达数据,那么就会涉及到差异基因,最基础的思路就是获得表达数据,根据临床信息进行分组,利用各种软件计算出差异表达基因,这里主要基于G ...
- AGE-PERIOD-COHORT (APC) 连续变量和二分类变量分析全代码
Age-period-cohort(APC)它的算法原理首先需要明白的是这种算法是针对时间这个维度进行了精雕细琢,它把时间维度进行了针对性的分解,分解为年龄效应,时期效应和队列效应.APC 分析的目的 ...
- WGCNA分析 | 代码一
注意:今天的教程比较长,请规划好你的时间.本文是付费内容,在本文文末有本教程的全部的代码和示例数据. 输出结果 分析代码 关于WGCNA分析,如果你的数据量较大,建议使用服务期直接分析,本地分析可能导 ...
- python基因差异分析_差异表达基因的分析(2)
应学生及个别博友的要求,尽管专业博文点击率和反应均很差,但在去San Diego参加PAG会议之前,还是抽时间给出[R高级教程]的第二专题.专题一给出了聚类分析的示例,本专题主要谈在表达谱芯片分析中如 ...
- 差异表达基因分析[转载]
转自:https://wenku.baidu.com/view/2532ab5176c66137ef06191a.html 1.转录组 2.转录组研究重要性 3.转录组研究技术 3.1三种的比较 3. ...
- 差异表达基因火山图(ggplot函数)
1. 读入数据 差异表达基因来自limma分析结果. # read the file data <- read.csv("diff_expr_genes.csv",row.n ...
- limma包分析差异表达基因
limma利用广义线性模型进行差异表达基因分析,主要用于分析微阵列数据. Data analysis, linear models and differential expression for mi ...
最新文章
- 程序员因重复记录日志撑爆ELK被辞退!
- 监控Spark应用方法简介
- Java知识系统回顾整理01基础05控制流程07结束外部循环
- Spring Boot YAML配置
- .NET NPOI导出Excel详解
- 光源时间_缩短背光源的使用寿命的原因
- 在linux怎样删除文件夹里,linux删除文件夹(里头有文件)
- SQL中的CASE使用方法
- vm ubuntu设置中文_如何在本地Ubuntu Linux机器或VM上设置LAMP服务器
- 兰州大学计算机调剂2020,兰州大学2020考研调剂公告
- Android自定义ViewGroup、自定义属性及自定义View
- atitit 高扩展性解决方案.docx
- 3d文件格式转换工具
- Arduino UNO测量电容值
- python列表的操作
- 小程序报错类—— thirdScriptError sdk uncaught third Error Cannot read property '$mount' of unde
- php微信摇一摇开发文档,微信摇一摇页面管理
- 爱普生Epson WorkForce WF-7725 一体机驱动
- 我要的仅此而已:伤感QQ心情日志
- java输出数学和英语成绩_java 计算班里每个同学3门课(英语、数学、数据库)的平均成绩和总成绩,编写一个成绩类来实现这些功能。...