R语言~高级PCA分析

主成分分析,不只需要找到几个虚拟但不知具体为何物的主成分因子,还需要知道我们提供的变量在各个主成分因子上的载荷(有正有负)和贡献度,以及具体保留多少个主成分因子才能既尽量保留数据的原始变异又能避免过度冗余。这是一个值得思考的问题。甚至能发nature,science。

加载程序包

packages <- c("ggplot2", "reshape2",  'ggpmisc', 'stringr',
              'vegan', "FactoMineR", "factoextra", "ade4", "ggpubr",'scales')

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}

invisible(lapply(packages, library, character.only = TRUE))
library('dplyr')

source('utils.R')

读入并整理变量数据

dat <- read.table("../Data/GlobalAtlasv2_scores_v5.txt", header = T, sep = "\t")
head(dat)
dat$AridityIndex_wc2 = NULL
names(dat) = gsub('_18s_richness','', names(dat))
names(dat) = gsub('Richness_bacteria','Bacteria', names(dat))
names(dat) = gsub('_wc2','',names(dat))
names(dat) = gsub('WaterHoldingCapacity','WHC',names(dat))
names(dat) = gsub('Proportion_decomposers','Decomposers',names(dat))
names(dat) = gsub('Proportion_symbiotic_fungi','Sym_fungi', names(dat))
names(dat) = gsub('Pathogen_control','Pathogen_ctrl',names(dat))
names(dat) = gsub('Soil_respiration','Soil_resp',names(dat))
names(dat) = gsub('Soil_salinity','Salinity',names(dat))
names(dat) = gsub('Soil_pH','pH', names(dat))
names(dat) = gsub('Clay_silt_c','Clay_silt',names(dat))
names(dat) = gsub('Soil_ORC_g_kg','SOC', names(dat))
names(dat) = gsub("PO4_c", "PO4", names(dat))
names(dat) = gsub('Aridity_Index_v3','AridityIndex', names(dat))
dat$Salinity = log(dat$Salinity)

构建向量存储变量组合,以便后文直接引用

div_var = c("Oomycota","Bacteria", "Fungi", "Nematoda","Ciliophora", 
            "Amoebozoa","Excavata", "Apicomplexa","Platyhelminthes", "Annelida", 
            "Rotifera","Cercozoa", "Ochrophyta", "Dinoflagellata", "Arachnida", 
            "Collembola",  "Neoptera","Myriapoda", "Tardigrada",   "Chlorophyta")

func_var = c("Porosity","WHC","NH4","NO3","PO4","NAG","Soil_resp",
             "NPP","Pathogen_ctrl" ,"Sym_fungi","Decomposers")

env_var = c("pH", "Salinity", "Clay_silt", "SOC", "Plant_cover", "MAT", "PSEA", "MDR", "TSEA",'AridityIndex')

执行PCA分析

pca = PCA(scale(subset(dat, select = func_var)), graph = F, ncp = 5)
ind = get_pca_ind(pca)
dat$PC1 <- ind$coord[,1]
dat$PC2 <- ind$coord[,2]
dat$PC3 <- ind$coord[,3]
dat$PC4 <- ind$coord[,4]
dat$PC5 <- ind$coord[,5]

绘制PCA排序图

Biplot <-fviz_pca_biplot(pca, fill.ind = dat$Eco_corrected, palette = 'lancet',
                         col.ind = "white", geom.ind = "point",
                         label ="none", labelsize = 2,
                         col.var = "black", arrowsize = 0.3,
                         repel=TRUE, pointshape = 21, pointsize = 5, alpha.ci = 0.5) + 
  # scale_fill_manual(breaks = c('Forest', 'Grassland'), values = c('bisque4','chartreuse4',))+
  geom_hline(yintercept = 0, lty = 2, lwd = 0.1)+
  geom_vline(xintercept = 0, lty = 2, lwd = 0.1) +
  labs(fill = "Vegetation",
       title = NULL,
       x = sprintf("PC1  (%.2f %s)", round(pca$eig[1,2],2), '%'), 
       y = sprintf("PC2  (%.2f %s)", round(pca$eig[2,2],2), '%')) + 
  theme_classic()+
  theme(axis.title = element_text(size = 6,color = 'black'),
        axis.text = element_text(size = 6,color = 'black'),
        panel.grid = element_blank(),
        legend.background = element_blank(),
        legend.box.background = element_blank(),
        legend.position = c(0.15,0.2),
        legend.key.height = unit(0.3,'cm'),
        legend.title = element_text(size = 6,color = 'black'),
        legend.text = element_text(size = 6, color = 'black',margin = margin(l = -8)))

判定最优的主成分因子个数

pca1 <- dudi.pca(subset(dat, select = func_var), center = TRUE, scale = TRUE, scannf=FALSE, nf = 5)
test1 <- testdim(pca1, nrepet = 999)
test1

print(paste("Number of statistical significant components according to Dray et al., 2006:", test1$nb.cor))

提取各变量在主成分因子上的载荷并进行显著性判定

var <- pca$var
# (1) bootstrapped egigenvector, p value < 0.05
boot5 <- netoboot(subset(dat, select = func_var), scannf = F, nf = 5, permutations = 1000)

pvalmatrix <- computePval_peres(pca1$c1, boot5)

# (2) fixed threshold
coordPCthr_val <- rbind(abs(var$coord[,1]), abs(var$coord[,2]), abs(var$coord[,3]),
                        abs(var$coord[,4]), abs(var$coord[,5]))#Richman 1988

coordPCthr <- rbind(ifelse(abs(var$coord[,1])>0.3, TRUE, FALSE),
                    ifelse(abs(var$coord[,2])>0.3, TRUE, FALSE),
                    ifelse(abs(var$coord[,3])>0.3, TRUE, FALSE),
                    ifelse(abs(var$coord[,4])>0.3, TRUE, FALSE),
                    ifelse(abs(var$coord[,5])>0.3, TRUE, FALSE))#Richman 1988

# (3) threshold method, their contribution is above 100/ndim
contrPCthr <- rbind(ifelse(var$contrib[,1]>(100/length(func_var)), TRUE, FALSE),
                    ifelse(var$contrib[,2]>(100/length(func_var)), TRUE, FALSE),
                    ifelse(var$contrib[,3]>(100/length(func_var)), TRUE, FALSE),
                    ifelse(var$contrib[,4]>(100/length(func_var)), TRUE, FALSE),
                    ifelse(var$contrib[,5]>(100/length(func_var)), TRUE, FALSE))

contrPCthr_val <- rbind(var$contrib[,1], var$contrib[,2], var$contrib[,3],
                        var$contrib[,4], var$contrib[,5])

df.PCA.sign <- data.frame(rbind(pvalmatrix, contrPCthr_val, coordPCthr_val))

df.PCA.sign$PC <- rep(c("PC1", "PC2", "PC3", "PC4", "PC5"), 3)
df.PCA.sign$Method <- rep(c("Peres-Neto et al., 2003", "Threshold", "Threshold Richmann 1988"),each = 5)
write.table(df.PCA.sign, file = 'PCA_signtest.csv', row.names = F)

## -- define the relevant loadings for the final plotting  
pvalmatrix_TF <- ifelse(pvalmatrix < 0.05, TRUE, FALSE)

三种检验方法均指示显著差才是最终显著

pc1Relevance<-coordPCthr[1,]*pvalmatrix_TF[1,]*contrPCthr[1,]
pc2Relevance<-coordPCthr[2,]*pvalmatrix_TF[2,]*contrPCthr[2,]
pc3Relevance<-coordPCthr[3,]*pvalmatrix_TF[3,]*contrPCthr[3,]
pc4Relevance<-coordPCthr[4,]*pvalmatrix_TF[4,]*contrPCthr[4,]
pc5Relevance<-coordPCthr[5,]*pvalmatrix_TF[5,]*contrPCthr[5,]

整理结果并绘图

df.loadings <- data.frame(EFP = rep(row.names(var$coord),5),
                          val = c(var$coord[,1],var$coord[,2], var$coord[,3], 
                                  var$coord[,4], var$coord[,5]),
                          contr = c(ifelse(pc1Relevance == 1, 'High', 'Low'),
                                    ifelse(pc2Relevance == 1, 'High', 'Low'),
                                    ifelse(pc3Relevance == 1, 'High', 'Low'),
                                    ifelse(pc4Relevance == 1, 'High', 'Low'),
                                    ifelse(pc5Relevance == 1, 'High', 'Low')),
                          PC = c(rep("PC1",length(row.names(var$coord))),
                                 rep("PC2",length(row.names(var$coord))),
                                 rep("PC3",length(row.names(var$coord))),
                                 rep("PC4",length(row.names(var$coord))),
                                 rep("PC5",length(row.names(var$coord)))))

df.loadings$EFP = factor(df.loadings$EFP, 
                         levels = c('Porosity','WHC','Soil_resp','NPP','NO3','NH4','PO4',
                                    'NAG','Pathogen_ctrl','Decomposers','Sym_fungi'),
                         labels = c('Porosity','WHC',bquote('Soil'~'Resp'),'NPP',
                                    bquote("NO"[3]),bquote('NH'[4]),bquote('PO'[4]),
                                    'NAG',bquote('Pathogen'~ 'Ctrl'), bquote('Decomposers'~'(%)'), 
                                    bquote('Sym-fungi'~'(%)')))
                                           

loadings_p <- ggplot(data= subset(df.loadings, !PC %in% c("PC4",'PC5')), 
                     aes(x=EFP, y=val, fill = contr)) + 
  facet_grid(. ~ PC, scales = "free_y") +
  scale_y_continuous(breaks = c(-0.4,0,0.4,0.8), labels = dropLeadingZero)+
  scale_x_discrete(labels = parse(text = levels(df.contrib$EFP)))+
  geom_bar(stat="identity", position=position_dodge()) + coord_flip() +
  scale_fill_manual(values=c("#E69F00", "#999999")) + theme_classic() + 
  labs(y="Loadings", x = NULL) + 
  theme(legend.position='none',
        strip.text = element_text(size = 6,color = 'black'),
        strip.background = element_blank(),
        axis.title = element_text(size = 6,color = 'black'),
        axis.text = element_text(size = 6,color = 'black'))




df.contrib <- data.frame(EFP = rep(row.names(var$contrib),5),
                         val = c(var$contrib[,1],var$contrib[,2],var$contrib[,3],
                                 var$contrib[,4],var$contrib[,5]),
                         contr = c(ifelse(pc1Relevance == 1, 'High', 'Low'),
                                   ifelse(pc2Relevance == 1, 'High', 'Low'),
                                   ifelse(pc3Relevance == 1, 'High', 'Low'),
                                   ifelse(pc4Relevance == 1, 'High', 'Low'),
                                   ifelse(pc5Relevance == 1, 'High', 'Low')),
                         PC = c(rep("PC1",length(row.names(var$contrib))),
                                rep("PC2",length(row.names(var$contrib))),
                                rep("PC3",length(row.names(var$contrib))),
                                rep("PC4",length(row.names(var$contrib))),
                                rep("PC5",length(row.names(var$contrib)))))

df.contrib$EFP = factor(df.contrib$EFP, 
                        levels = c('Porosity','WHC','Soil_resp','NPP','NO3','NH4','PO4',
                                   'NAG','Pathogen_ctrl','Decomposers','Sym_fungi'),
                        labels = c('Porosity','WHC',bquote('Soil'~'Resp'),'NPP',bquote("NO"[3]),
                                   bquote('NH'[4]),bquote('PO'[4]),'NAG',
                                   bquote('Pathogen'~ 'Ctrl'), bquote('Decomposers'~'(%)'), 
                                   bquote('Sym-fungi'~'(%)')))

contribs_p <- ggplot(data=subset(df.contrib, !PC %in% c("PC4",'PC5')), 
                     aes(x=EFP, y=val, fill = contr)) + 
  facet_grid(. ~ PC, scales = "free_y") +
  geom_bar(stat="identity", position=position_dodge()) + coord_flip() +
  theme_classic() + 
  scale_fill_manual(values=c("#E69F00", "#999999")) +
  scale_x_discrete(labels = parse(text = levels(df.contrib$EFP)))+
  labs(y="Contribution [%]", x = NULL) + 
  theme(legend.key.size = unit(0.3, 'cm'),
        legend.title = element_text(size=6,color = 'black'),
        legend.text = element_text(size=6,color = 'black'),
        legend.background = element_blank(),
        legend.position = c(0.9, 0.5),
        strip.text = element_text(size = 6,color = 'black'),
        strip.background = element_blank(),
        axis.title = element_text(size = 6,color = 'black'),
        axis.text = element_text(size = 6,color = 'black'))

exp_var_p <- fviz_eig(pca, addlabels=F, 
               barfill="white", barcolor ="darkblue",
               ncp = 5, labelsize = .1,
               linecolor ="red") + 
  theme_classic() + 
  scale_y_continuous(expand = c(0,0))+
  geom_label(aes(x=1, y=33, label="Stretch it"), vjust= -1)+
  labs(title = NULL, x = "Principal Components (PC)", y = "Explained variance (%)") + 
  theme(axis.title = element_text(size = 6,color = 'black'),
        axis.text = element_text(size = 6, color = 'black'),
        plot.margin = margin(l = 20, b = 6, t = 6, r = 6))


figure2 <- ggarrange(Biplot, exp_var_p, contribs_p, loadings_p,
                     labels = c("a", "b","c","d"), 
                     font.label=list(face = "bold", color="black",size= 10,
                                     axis.text.y = element_text(face="bold")),
                     common.legend = FALSE,
                     nrow = 2, ncol = 2)
Figure2.jpg
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
禁止转载,如需转载请通过简信或评论联系作者。
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 162,050评论 4 370
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 68,538评论 1 306
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 111,673评论 0 254
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 44,622评论 0 218
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 53,047评论 3 295
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 40,974评论 1 224
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 32,129评论 2 317
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 30,893评论 0 209
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 34,654评论 1 250
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 30,828评论 2 254
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 32,297评论 1 265
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 28,619评论 3 262
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 33,326评论 3 243
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 26,176评论 0 8
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 26,975评论 0 201
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 36,118评论 2 285
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 35,909评论 2 278

推荐阅读更多精彩内容