重复一篇3分左右纯生信文章(第三部分)

本文目的:一文解决WGCNA分析问题。

原文章使用了自己识别的五个lncRNA,与mRNA合并做WGCNA分析,目的是为了得到lncRNA相关的mRNA。所以这里,我们做WGCNA,所需要的数据可以推测其包括:lncRNA表达量,mRNA表达矩阵,一些临床参数数据。

代码WGCNA_prepare.R(给WGCNA分析做前期数据准备)

# =======================================================
#########################################################
#########################################################
#作者:工程师2号 ########################################
#简书笔记博客(柳叶刀与小鼠标)##########################
# https://www.jianshu.com/u/619b87e54936 ###############
#########################################################
#########################################################
# =======================================================




# =======================================================
#### 下载表达数据################################
# =======================================================


library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)
library(DT)
library(tibble)
rm(list=ls())

setwd('D:\\test\\Five_key_lncRNAs\\chapter1\\download\\expression')

exp <- GDCquery(project = "TCGA-PAAD", 
                data.category = "Transcriptome Profiling", 
                data.type = "Gene Expression Quantification", 
                workflow.type = "HTSeq - FPKM")

GDCdownload(exp)

expdat <- GDCprepare(query =exp)

count_matrix= as.data.frame(assay(expdat))




setwd("D:\\Originaldata\\GRCH\\Homo_sapiens.GRCh38.90")

load("gtf_df.Rda")

test <- gtf_df[1:5,]

View(test)



fpkmToTpm <- function(fpkm)
{
  exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}


expr <- as.data.frame (apply(count_matrix  , 2, fpkmToTpm))


expr <-  expr %>% rownames_to_column("gene_id")


###表达矩阵中提取mRNA表达矩阵

mRNA_exprSet <- gtf_df %>%
  dplyr::filter(type=="gene",gene_biotype=="protein_coding") %>%
  dplyr::select(c(gene_name,gene_id,gene_biotype)) %>%
  dplyr::inner_join(expr,by ="gene_id") %>%
  tidyr::unite(gene_id,gene_name,gene_id,gene_biotype,sep = " | ")


save(mRNA_exprSet,file = "mRNA_exprSet.Rda")


# =======================================================
#### 提取mRNA表达矩阵################################
# =======================================================

mRNA_exprSet <- mRNA_exprSet %>%
  tidyr::separate(gene_id, c("gene_name","gene_id","gene_biotype"),
                  sep = " \\| ")


mRNA_exprSet <- mRNA_exprSet[,-(2:3)]


index <- duplicated(mRNA_exprSet$gene_name)
mRNA_exprSet <- mRNA_exprSet[!index,]
row.names(mRNA_exprSet) <- mRNA_exprSet$gene_name
mRNA_exprSet$gene_name <- NULL


###第五节:删除癌旁样本和二次测序的样本
metadata <- data.frame(colnames(mRNA_exprSet))


for (i in 1:length(metadata[,1])) {
  num <- as.numeric(as.character(substring(metadata[i,1],14,15)))
  if (num == 1 ) {metadata[i,2] <- "T"}
  if (num != 1) {metadata[i,2] <- "N"}
}
names(metadata) <- c("id","group")
metadata$group <- as.factor(metadata$group)


metadata <- subset(metadata,metadata$group == "T")
metadata

mRNA_exprSet1 <- mRNA_exprSet[,which(colnames(mRNA_exprSet) %in% metadata$id)]

metadata <- data.frame(colnames(mRNA_exprSet1))
for (i in 1:length(metadata[,1])) {
  chr <-  as.character(substring(metadata[i,1],22,25))
  if ( chr == 'A277' ) {metadata[i,2] <- "Rep"}
  if ( chr!= 'A277' ) {metadata[i,2] <- "T"}
}

names(metadata) <- c("id","group")
metadata$group <- as.factor(metadata$group)
metadata <- subset(metadata,metadata$group == "T")
metadata


###保存mRNA表达矩阵

mRNA_exprSet2 <- mRNA_exprSet1[,which(colnames(mRNA_exprSet1) %in% metadata$id)]

setwd('D:\\test\\Five_key_lncRNAs\\chapter3')

save(mRNA_exprSet2,file = 'mRNA_exprSet2.Rdata')


# =======================================================
####提取lncRNA表达矩阵###########################
# ======================================================


setwd('D:\\test\\Five_key_lncRNAs\\chapter3')
library(survival)
library(future.apply)
plan(multiprocess)

library(TCGAbiolinks)
library(dplyr)
library(DT)
library(tibble)

rm(list=ls())

survival_data <- read.csv('survival.csv',
                          header = T,row.names = 1)  %>%
  dplyr::select( Barcode,OS  ,OS.Time)



load('mRNA_exprSet2.Rdata')


mRNA_exprSet <- mRNA_exprSet2

colnames(mRNA_exprSet) <- substr(x=colnames(mRNA_exprSet),
                                 start = 1,
                                 stop = 12)

mRNA_exprSet <- as.data.frame(t(mRNA_exprSet))

mRNA_exprSet  <- mRNA_exprSet +0.001

a <- mRNA_exprSet[1:6,1:6]



myfun_cv <-function(x){
  cv<-  sd(x)/mean(x)  
  return(cv) 
}


#挑选变异系数大于0.3的mRNA基因
cv_gene <-  as.data.frame(apply(mRNA_exprSet, 2, myfun_cv))

colnames(cv_gene) <- 'cv'

cv_gene <- subset(cv_gene,cv_gene$cv > 0.3)

mRNA_exprSet <- mRNA_exprSet[,which(colnames(mRNA_exprSet) %in% rownames(cv_gene))]

mRNA_exprSet$Barcode <- rownames(mRNA_exprSet)

survival_dat <- merge(survival_data, mRNA_exprSet, by='Barcode')


####mRNA单因素分析,因为没必要用所有的mRNA来做WGCNA
#仅仅需要有生存意义的mRNA和我们选定的三个lncRNA即可

colnames(survival_dat) <- sub("\\-", "", colnames(survival_dat))

colnames(survival_dat) <- sub("\\.", "_", colnames(survival_dat))

colnames(survival_dat) <- sub("\\:", "_", colnames(survival_dat))

covariates <- as.character(colnames(survival_dat))[-(1:3)]

univ_formulas <- sapply(covariates,
                        function(x){
                          as.formula(paste('Surv(OS_Time, OS)~', x))})

univ_models <- future_lapply( univ_formulas,
                              function(x){coxph(x, data = survival_dat)})


univ_results <-  lapply(univ_models,
                        function(x){ 
                          x <- summary(x)
                          p.value <- signif(x$wald["pvalue"], digits = 2)
                          beta <- signif(x$coef[1], digits = 2)
                          HR <- signif(x$coef[2], digits = 2)
                          HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
                          HR.confint.upper <- signif(x$conf.int[,"upper .95"], 2)
                          HR <- paste0(HR, " (", 
                                       HR.confint.lower, "-", HR.confint.upper, ")")
                          res <- c(beta, HR, p.value)
                          names(res) <- c("coef", "HR (95% CI for HR)", "p.value")
                          return(res)
                        })


res <- as.data.frame(t(do.call(cbind, univ_results)))

res[1:6,]

res$p.value <- as.numeric(as.character(res$p.value))

res$coef <- as.numeric(as.character(res$coef ))

res[1:6,]

table(res$p.value < 0.05)

res  <- res[res$p.value <= 0.05, ]

res  <- res[order(res$p.value), ]

single_gene <- rownames(res)
#single_gene即为得到的单因素cox分析中P小于0.05的基因

sig_genes <- survival_dat[ ,which(colnames(survival_dat) %in% single_gene)]

rownames(sig_genes) <- survival_dat$Barcode

save(sig_genes,file = 'sig_genes.Rdata')
#得到了有生存意义的mRNA表达矩阵


# =======================================================
####合并选定的lncRNA,以及mRNA表达矩阵####################
# =======================================================

library(dplyr)

library(tidyr)

setwd('D:\\test\\Five_key_lncRNAs\\chapter3')

rm(list=ls())

load( 'sig_genes.Rdata')

lncRNA <- read.table('diffmRNAExp.txt',header = T,sep = '\t')


#diffmRNAExp.txt文件是上一节生成的差异基因文件
univariate_data <- read.table('diffmRNAExp.txt',header = T,row.names = 1,
                              sep = '\t')

univariate_data  <- log2(univariate_data +0.001)

metadata <- data.frame(colnames(univariate_data))





for (i in 1:length(metadata[,1])) {
  num <- as.numeric(as.character(substring(metadata[i,1],14,15)))
  if (num == 1 ) {metadata[i,2] <- "T"}
  if (num != 1) {metadata[i,2] <- "N"}
}
names(metadata) <- c("id","group")
metadata$group <- as.factor(metadata$group)


metadata <- subset(metadata,metadata$group == "T")
metadata

exprSet <- univariate_data[,which(colnames(univariate_data) %in% metadata$id)]

colnames(exprSet)  <- substr(x=colnames(exprSet),start = 1,stop = 12)
colnames(exprSet)  <- chartr(old='.',new = '-',x=colnames(exprSet) )

survival <- read.csv('survival.csv',header = T,row.names = 1)




select_colname <- function(expr,name){
  expr <- expr[,which(colnames(expr) %in% name[,1])]
  expr  <- expr[,order(names(expr))]   
  name <- name[which(name[,1] %in% colnames(expr)),]
  name <- name[order(name[,1]),]
  expr_name <- list( expr,name)
  return( expr_name)
}

dat <- select_colname(exprSet,survival )


data <- as.data.frame(t(dat[[1]]))


data$Barcode <- row.names(data)
survival <- survival %>%
  dplyr::select(Barcode,OS,OS.Time,OS.Time,Age,Grade,TNM)


survival_dat  <- merge(survival,data,by='Barcode')

library(stringr)

survival_dat$Grade <- str_extract(survival_dat$Grade,pattern = '\\d')
survival_dat$TNM <- str_extract(survival_dat$TNM,pattern = 'T\\d')
survival_dat$TNM <- str_extract(survival_dat$TNM,pattern = '\\d')


getmode <- function(v){
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v,uniqv)))]
}

# cha <- c('w','w','w','e','e','r')
# num <- c(0,0,1,2,2,3,4,4,4,4,6,6)
# 
# getmode(num)
# getmode(cha)
# 

library(Hmisc)
library(mice)
md.pattern(survival_dat)


survival_dat$TNM <- impute(survival_dat$TNM,getmode)
survival_dat$Grade <- impute(survival_dat$Grade,getmode)


survival_dat <- survival_dat %>%
  dplyr::select( Barcode,OS,OS.Time ,Age,
                 Grade, TNM , MIR202HG,
                 LINC02260,LINC01894 )


md.pattern(survival_dat)

sig_genes$Barcode <- rownames(sig_genes)

data_wgcna <- merge(survival_dat,sig_genes,by='Barcode')

save(data_wgcna,file = 'data_wgcna.Rdata')

#保存用于WGCNA分析的数据,行为样本名
#列为基因或者临床属性
# =======================================================
#########################################################
#########################################################
#作者:工程师2号 ########################################
#简书笔记博客(柳叶刀与小鼠标)##########################
# https://www.jianshu.com/u/619b87e54936 ###############
#########################################################
#########################################################
# =======================================================


还有 65% 的精彩内容

推荐阅读更多精彩内容