前面由于对miRNA的探针数目没有正确的理解,以为数据一直没有下载完全,折腾了一番。后来经老大jimmy提醒了,miRNA有2000左右的探针就是正常的呀。所以就可以愉快地继续进行分析啦!
下面是对数据集GSE85589中的原图进行复现,一篇WGCNA文章的原图就到手了!
1.下载数据+准备数据
rm(list = ls())
options(stringsAsFactors = F)
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
options(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
library(GEOquery)
library(WGCNA)
f='GSE85589_eSet.Rdata'
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE106292
# 这个包需要注意两个配置,一般来说自动化的配置是足够的。
#Setting options('download.file.method.GEOquery'='auto')
#Setting options('GEOquery.inmemory.gpl'=FALSE)
if(!file.exists(f)){
gset <- getGEO('GSE85589', destdir=".",
AnnotGPL = F, ## 注释文件
getGPL = F) ## 平台文件
save(gset,file=f) ## 保存到本地
}
load('GSE85589_eSet.Rdata') ## 载入数据
class(gset) #查看数据类型
length(gset) #
class(gset[[1]])
# 因为这个GEO数据集只有一个GPL平台,所以下载到的是一个含有一个元素的list
a=gset[[1]]
dat=exprs(a)
dim(dat)
pd=pData(a)
save(dat,pd,file = 'step1-rawdata')
#######探索match的过程
# load('step1-rawdata')
# probe2symbol <- data.table::fread(file = "GPL19117-74051.txt")#对于这样的一个文件,就这么容易读进去了
#
# probe2symbol <- probe2symbol[grep("Homo sapiens", #挑选出智人的注释信息
# ignore.case = F,
# probe2symbol$`Species Scientific Name`), ]
#
# dim(probe2symbol)
# table(probe2symbol$`Species Scientific Name`)#确保都是hs
# probe2symbol <- probe2symbol[ , c(1,4)]
# names(probe2symbol) <- c("PROBE_ID", "SYMBOL_ID")
#
# dat <- as.data.frame(dat)
# dat[1:4,1:4]
#
# rownames(dat) %in% probe2symbol$PROBE_ID
# table(rownames(dat) %in% probe2symbol$PROBE_ID)
# dat1 <- dat[match(rownames(dat),probe2symbol$PROBE_ID),]
# dat[1:4,1:4]
# dat2 <- dat[match(probe2symbol$PROBE_ID,rownames(dat)),]
# dat1[1:4,1:4]
# dat3 <- na.omit(dat2)
# dat2[1:4,1:4]
#
# ids <- probe2symbol[match(rownames(dat),probe2symbol$PROBE_ID),]
# rownames(dat) <- ids$SYMBOL_ID
# table(duplicated(rownames(dat)))#table后发现false为2578行,正好是dat的行数,说明没有重复项
# #而其实table(duplicated(probe2symbol$SYMBOL_ID))后发现,还是有很多重复的基因名的,只是我们这里作为2578个的基因名是没有重复的/
# table(duplicated(probe2symbol$SYMBOL_ID))
# dat[1:4,1:4]
##############################################################################
load('step1-rawdata')
dat <- as.data.frame(dat)
dat[1:4,1:4]
probe2symbol <- data.table::fread(file = "GPL19117-74051.txt")#对于这样的一个文件,就这么容易读进去了
probe2symbol <- probe2symbol[grep("Homo sapiens", #挑选出智人的注释信息
ignore.case = F,
probe2symbol$`Species Scientific Name`), ]
dim(probe2symbol)
table(probe2symbol$`Species Scientific Name`)#确保都是hs
probe2symbol <- probe2symbol[ , c(1,4)]
names(probe2symbol) <- c("PROBE_ID", "SYMBOL_ID")
ids <- probe2symbol
head(ids)
colnames(ids)=c('probe_id','symbol')
ids=ids[ids$symbol != '',]
ids=ids[ids$probe_id %in% rownames(dat),]
dat[1:4,1:4]
dat=dat[ids$probe_id,]
dat[1:4,1:4]
ids$median=apply(dat,1,median) #ids新建median这一列,列名为median,同时对dat这个矩阵按行操作,取每一行的中位数,将结果给到median这一列的每一行
ids=ids[order(ids$symbol,ids$median,decreasing = T),]#对ids$symbol按照ids$median中位数从大到小排列的顺序排序,将对应的行赋值为一个新的ids
ids=ids[!duplicated(ids$symbol),]#将symbol这一列取取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果s
dat=dat[ids$probe_id,] #新的ids取出probe_id这一列,将dat按照取出的这一列中的每一行组成一个新的dat
rownames(dat)=ids$symbol#把ids的symbol这一列中的每一行给dat作为dat的行名
dat[1:4,1:4] #保留每个基因ID第一次出现的信息
colnames(dat)
pdac <- grep('PC',pd$title)
normal <- grep('N[0-9]+\\b',pd$title)
x <- c(pdac,normal)
expr <- dat[,x]
#对pd_fil进行去除冗余项
length(unique(pd[,1]))
length(unique(pd[,2]))
length(unique(pd[,3]))
apply(pd,2,function(x){length(unique(x))})
apply(pd,2,function(x){
length(unique(x))> 1
})
pd_fil <- pd[,apply(pd,2,function(x){
length(unique(x))> 1
})]
dim(pd_fil)
cli <- pd_fil[x,]
save(expr,cli,file = 'step1-input.Rdata')
2.做WGCNA分析
library(WGCNA)
rm(list = ls())
load('step1-input.Rdata')
#####step 1
datExpr0=as.data.frame(t(expr))
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK
if(T){
sampleTree = hclust(dist(datExpr0), method = "average")
#sizeGrWindow(15,12)
par(cex = 0.6)
par(mar = c(0,4,2,0))
#png("dynamicColors_mergedColors.png",width = 800,height = 600)
pdf('Sample clustering.pdf',width = 25,height =20 )
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
abline(h = 26.5, col = "red")
dev.off()
}
#这个仅留做记录,原文并没有去掉离群样本
clust = cutreeStatic(sampleTree, cutHeight = 26.5, minSize = 10)
table(clust) # 0代表切除的,1代表保留的
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
下面是构建无尺度网络
###step 2
if(T){
powers = c(c(1:10), seq(from = 12, to=30, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
#设置网络构建参数选择范围,计算无尺度分布拓扑矩阵
png("step2-beta-value.png",width = 800,height = 600)
# Plot the results:
##sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()
}
sft
sft$powerEstimate
save(sft,file = "step2_beta_value.Rdata")
如下原文所示,用的贝塔值是1,我的结果也是,然后R^2值是0.89,接下来选择==一步法==
###step 3
if(T){
cor <- WGCNA::cor
if(T){
net = blockwiseModules(
datExpr,
power = 1,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = F,
verbose = 3
)
table(net$colors)
}
sizeGrWindow(12, 9)
mergedColors = labels2colors(net$colors)
pdf('step3-dynamicColors_mergedColors.pdf',width = 25,height =20 )
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
table(moduleColors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];
save(MEs, moduleLabels, moduleColors, geneTree,
file = "AS-green-FPKM-02-networkConstruction-auto.RData")
}
###step 4
if(T){
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#首先针对样本做个系统聚类
datExpr_tree<-hclust(dist(datExpr), method = "average")
#针对前面构造的样品矩阵添加对应颜色
sample_colors1 <- numbers2colors(as.numeric(factor(datTraits$group_list)),
colors = c("green","blue","red","yellow","black"),signed = FALSE)
ssss=as.matrix(data.frame(group_list=sample_colors1))
par(mar = c(1,4,3,1),cex=0.8)
pdf('step4-sample-subtype-cluster.pdf',width = 25,height =20 )
plotDendroAndColors(datExpr_tree, ssss,
groupLabels = colnames(sample),
cex.dendroLabels = 0.8,
marAll = c(1, 4, 3, 1),
cex.rowText = 0.01,
main = "Sample dendrogram and trait heatmap")
dev.off()
}
##step 5
table(datTraits)
if(T){
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
design1=model.matrix(~0+as.factor(datTraits$group_list))
design=design1
colnames(design)
colnames(design)=c(levels(as.factor(datTraits$group_list)) )
moduleColors <- labels2colors(net$colors)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0); ##不同颜色的模块的ME值矩 (样本vs???
moduleTraitCor = cor(MEs, design , use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
png("step5-Module-trait-relationships.png",width = 800,height = 1200,res = 120)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(design),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
table( labels2colors(net$colors))
}
<img src="https://tva1.sinaimg.cn/large/006y8mN6gy1g9600thqfxj30no0xcq69.jpg" alt="image-20191121214946750" style="zoom:50%;" />
###step 6
###############首先计算??橛牖虻南喙匦跃卣?# 把各个module的名字提取出来(从第三个字符开始),用于一会重命名
modNames = substring(names(MEs), 3)
# 得到矩阵
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
# 矩阵t检验
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
# 修改列名
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
################再计算性状与基因的相关性矩阵
PC <- as.data.frame(design[,2])
names(PC) = "PC"
# 得到矩阵
geneTraitSignificance = as.data.frame(cor(datExpr, PC, use = "p"))
# 矩阵t检验
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
# 修改列名
names(geneTraitSignificance) = paste("GS.", names(PC), sep="")
names(GSPvalue) = paste("p.GS.", names(PC), sep="")
###############最后把两个相关性矩阵联合起来,指定感兴趣模块进行分析
###turquoise???module = "turquoise"
column = match(module, modNames)#找到目标??樗诹?moduleGenes = moduleColors==module#找到模块基因所在行
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for PC",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
###brown???module = "brown"
column = match(module, modNames)#找到目标??樗诹?moduleGenes = moduleColors==module#找到??榛蛩谛?sizeGrWindow(7, 7)
par(mfrow = c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for PC",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
###step 7
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
## 只有连续型性状才能只有计算
## 这里把是否属于 Luminal 表型这个变量用0,1进行数值化。
PC = as.data.frame(design[,2]);
names(PC) = "PC"
# Add the weight to existing module eigengenes
MET = orderMEs(cbind(MEs, PC))
# Plot the relationships among the eigengenes and the trait
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 90)
# Plot the dendrogram
sizeGrWindow(6,6);
par(cex = 1.0)
## ??榈木劾嗤?plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
## 性状与??槿韧?plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
plotDendrograms = FALSE, xLabelsAngle = 90)
文章中的原图
最后友情宣传生信技能树
全国巡讲:R基础,Linux基础和RNA-seq实战演练 : 预告:12月28-30长沙站