在前面系列教程中分享了如何下载TCGA表达矩阵(2022新版TCGA数据下载与整理),通过R进行数据合并就能得到TCGA泛癌表达矩阵了。有时候需要分析基因表达与TMB之间的相关性,就会涉及到TCGA突变数据下载和TMB计算。TCGA突变数据maf以前可以直接在TCGA下载,也可以用TCGAbiolinks包的GDCquery_Maf函数下载,后来TCGA不让下载了,TCGAbiolinks的GDCquery_Maf函数在新版本里面也被抛弃了。但是TCGAbiolinks包的作者提供了替代方案,具体见TCGAbiolinks: Searching, downloading and visualizing mutation files,作者提供了hg38和hg19两个版本的下载方法,以hg38的TCGA-CHOL为例:
library("TCGAbiolinks")
query <- GDCquery(
project = "TCGA-CHOL",
data.category = "Simple Nucleotide Variation",
access = "open",
legacy = FALSE,
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
maf <- GDCprepare(query)
得到的maf就可以用于后续TMB计算了,TMB的计算参考这个链接(TCGA 数据分析实战 —— TMB 与免疫浸润联合分析):
get_TMB <- function(file) {
# 需要用到的列
use_cols <- c(
"Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode",
"HGVSc", "t_depth", "t_alt_count"
)
# 删除这些突变类型
mut_type <- c(
"5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
"Silent", "RNA", "Splice_Region"
)
# 读取文件
df <- read_csv(file, col_select = use_cols)
data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
# 计算 VAF
mutate(vaf = t_alt_count / t_depth) %>%
group_by(Tumor_Sample_Barcode) %>%
summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
return(data)
}
为了得到泛癌的TMB数据,我将代码写成循环,并稍微做了些调整。
rm(list=ls())
library(TCGAbiolinks)
library(dplyr)
library(stringr)
projects <- getGDCprojects()$project_id
projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
projects <- projects[order(projects)]
pan_TMB=list()
for (project in projects){
query <- GDCquery(
project = project,
data.category = "Simple Nucleotide Variation",
access = "open",
legacy = FALSE,
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
maf <- GDCprepare(query)
## TMB calculation
# Code Reference https://zhuanlan.zhihu.com/p/394609586
get_TMB <- function(file) {
use_cols <- c("Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode",
"HGVSc", "t_depth", "t_alt_count")
# 删除这些突变类型
mut_type <- c("5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
"Silent", "RNA", "Splice_Region")
# 读取文件
df <- select(file, use_cols)
data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
# 计算 VAF
mutate(vaf = t_alt_count / t_depth) %>%
group_by(Tumor_Sample_Barcode) %>%
summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
return(data)
}
pan_TMB[[project]]=get_TMB(maf)
}
pan_TMB=do.call(rbind,pan_TMB)
pan_TMB$Cancer=str_split(rownames(pan_TMB),'-',simplify = T)[,2]
pan_TMB$Cancer=str_split(pan_TMB$Cancer,'[.]',simplify = T)[,1]
pan_TMB$Tumor_Sample_Barcode=str_sub(pan_TMB$Tumor_Sample_Barcode,1,16)
pan_TMB=pan_TMB[,c(5,1,3)]
colnames(pan_TMB)=c("Cancer","ID","TMB")
save(pan_TMB,file="pan_TMB.Rdata")
但是在循环到TCGA-MESO环节,总是会报"Error: Failure to retrieve index 140/0",然后R就重启了。没办法,把TCGA-MESO单独出来,再合并。中间处理TCGA-MESO时在用maftools的read.maf时也有报错"Error in read.maf(x, isTCGA = TRUE) : No non-synonymous mutations found Check vc_nonSyn`` argumet in
read.maffor details",后来按照Github上的方法直接读入表格([No non-synonymous mutations found Check vc_nonSyn argumet in read.maf
for details #838](https://github.com/PoisonAlien/maftools/issues/838)
最终代码如下:
rm(list=ls())
library(TCGAbiolinks)
library(dplyr)
library(stringr)
projects <- getGDCprojects()$project_id
projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
projects <- projects[order(projects)]
projects=projects[-19]
pan_TMB=list()
for (project in projects){
query <- GDCquery(
project = project,
data.category = "Simple Nucleotide Variation",
access = "open",
legacy = FALSE,
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
maf <- GDCprepare(query)
## TMB calculation
# Code Reference https://zhuanlan.zhihu.com/p/394609586
get_TMB <- function(file) {
use_cols <- c("Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode",
"HGVSc", "t_depth", "t_alt_count")
# 删除这些突变类型
mut_type <- c("5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
"Silent", "RNA", "Splice_Region")
# 读取文件
df <- select(file, use_cols)
data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
# 计算 VAF
mutate(vaf = t_alt_count / t_depth) %>%
group_by(Tumor_Sample_Barcode) %>%
summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
return(data)
}
pan_TMB[[project]]=get_TMB(maf)
}
# For TCGA-MESO
query <- GDCquery(
project = "TCGA-MESO",
data.category = "Simple Nucleotide Variation",
access = "open",
legacy = FALSE,
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
# Reference:https://github.com/PoisonAlien/maftools/issues/838
mafFilePath2 = dir(path = "GDCdata/TCGA-MESO",pattern = "masked.maf.gz$",full.names = T,recursive=T)
TCGA_MESO = lapply(mafFilePath2, data.table::fread, skip = "Hugo_Symbol")
TCGA_MESO = data.table::rbindlist(l = TCGA_MESO, use.names = TRUE, fill = TRUE)
pan_TMB[["TCGA-MESO"]]=get_TMB(TCGA_MESO)
pan_TMB=do.call(rbind,pan_TMB)
pan_TMB$Cancer=str_split(rownames(pan_TMB),'-',simplify = T)[,2]
pan_TMB$Cancer=str_split(pan_TMB$Cancer,'[.]',simplify = T)[,1]
pan_TMB$Tumor_Sample_Barcode=str_sub(pan_TMB$Tumor_Sample_Barcode,1,16)
pan_TMB=pan_TMB[,c(5,1,3)]
colnames(pan_TMB)=c("Cancer","ID","TMB")
pan_TMB$TMB=round(pan_TMB$TMB,1) # 保留一位小数
pan_TMB=pan_TMB[order(pan_TMB$Cancer),]
save(pan_TMB,file="pan_TMB.Rdata")
最终数据如下:
查看下每个肿瘤的突变情况。
table(pan_TMB$Cancer)
# ACC BLCA BRCA CESC CHOL COAD DLBC ESCA GBM HNSC KICH KIRC KIRP LAML LGG LIHC LUAD LUSC MESO OV PAAD PCPG
# 90 414 986 289 51 454 47 184 461 510 66 357 282 128 521 371 616 544 78 462 170 182
# PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC UCS UVM
# 493 161 238 469 431 146 494 121 518 57 80
通过ID可以和TCGA表达矩阵合并,再根据TCGA基因与免疫评分带侧边密度图的相关性点图的方法 就可以进行TMB与基因表达相关性分析了。