导语
GUIDE ╲
PCAtools是一个非常全面的主成分分析工具!
背景介绍
主成分分析 (PCA) 在数据科学、生物信息学等多个领域具有广泛的适用性。作为一种数学降维方法, PCA利用正交变换 (orthogonal transformation)将一系列可能线性相关的变量转换为一组线性不相关的新变量,也称为主成分,从而利用新变量在更小的维度下展示数据的特征。
PCAtools 提供了进行 PCA数据分析的功能,并且用户可以对结果进行多样的可视化。
R包安装
BiocManager::install("PCAtools")
library(PCAtools
#install development version
devtools::install_github('kevinblighe/PCAtools')
可视化展示
01
DESeq2
library(airway)
library(magrittr)
data('airway')
airway$dex %<>% relevel('untrt')
##对gene symbol注释ENSEMBL id
ens <- rownames(airway)
library(org.Hs.eg.db)
symbols <- mapIds(org.Hs.eg.db, keys = ens,
column = c('SYMBOL'), keytype = 'ENSEMBL')
symbols <- symbols[!is.na(symbols)]
symbols <- symbols[match(rownames(airway), names(symbols))]
rownames(airway) <- symbols
keep <- !is.na(rownames(airway))
airway <- airway[keep,]
###标准化数据
library('DESeq2')
dds <- DESeqDataSet(airway, design = ~ cell + dex)
dds <- DESeq(dds)
vst <- assay(vst(dds))
##PCA
p <- pca(vst, metadata = colData(airway), removeVar = 0.1)
scree plot
screeplot(p, axisLabSize = 18, titleLabSize = 22)
bi-plot
biplot(p, showLoadings = TRUE,
labSize = 5, pointSize = 5, sizeLoadingsNames = 5)
02
GEO
首先加载乳腺癌基因表达数据和RFS数据
#从GEO下载数据集
gset <- getGEO('GSE2990', GSEMatrix = TRUE, getGPL = FALSE)
x <- exprs(gset[[1]])
#移除Affymetrix control探针
x <- x[-grep('^AFFX', rownames(x)),]
#从pdata中提取对应信息
idx <- which(colnames(pData(gset[[1]])) %in%
c('age:ch1', 'distant rfs:ch1', 'er:ch1',
'ggi:ch1', 'grade:ch1', 'size:ch1',
'time rfs:ch1'))
metadata <- data.frame(pData(gset[[1]])[,idx],
row.names = rownames(pData(gset[[1]])))
#设置列名
colnames(metadata) <- c('Age', 'Distant.RFS', 'ER', 'GGI', 'Grade',
'Size', 'Time.RFS')
#准备一些表型
metadata$Age <- as.numeric(gsub('^KJ', NA, metadata$Age))
metadata$Distant.RFS <- factor(metadata$Distant.RFS, levels=c(0,1))
metadata$ER <- factor(gsub('\\?', NA, metadata$ER), levels=c(0,1))
metadata$ER <- factor(ifelse(metadata$ER == 1, 'ER+', 'ER-'), levels = c('ER-', 'ER+'))
metadata$GGI <- as.numeric(metadata$GGI)
metadata$Grade <- factor(gsub('\\?', NA, metadata$Grade), levels=c(1,2,3))
metadata$Grade <- gsub(1, 'Grade 1', gsub(2, 'Grade 2', gsub(3, 'Grade 3', metadata$Grade)))
metadata$Grade <- factor(metadata$Grade, levels = c('Grade 1', 'Grade 2', 'Grade 3'))
metadata$Size <- as.numeric(metadata$Size)
metadata$Time.RFS <- as.numeric(gsub('^KJX|^KJ', NA, metadata$Time.RFS))
#从pdata中删除含NA值的样本
discard <- apply(metadata, 1, function(x) any(is.na(x)))
metadata <- metadata[!discard,]
#过滤表达数据,匹配pdata中的样本
x <- x[,which(colnames(x) %in% rownames(metadata))]
#检查样本名称是否在pdata和表达式数据之间完全匹配
all(colnames(x) == rownames(metadata))
bi-plot
biplot(p)
biplot(p, showLoadings = TRUE, lab = NULL)
pairs plot
pairsplot(p)
loadings plot
plotloadings(p, labSize = 3)
eigencor plot
eigencorplot(p,
metavars = c('Study','Age','Distant.RFS','ER',
'GGI','Grade','Size','Time.RFS'))
03
进阶功能
PCAtools中的功能几乎应涵盖用户的各种需求。接下来的部分将介绍其中一些高级功能,并进行实际演示,说明如何使用 PCAtools 对数据进行临床分析。
首先,通过将探针ID映射到gene symbol来进行基因注释
suppressMessages(require(hgu133a.db))
newnames <- mapIds(hgu133a.db,
keys = rownames(p$loadings),
column = c('SYMBOL'),
keytype = 'PROBEID')
# tidy up for NULL mappings and duplicated gene symbols
newnames <- ifelse(is.na(newnames) | duplicated(newnames),
names(newnames), newnames)
rownames(p$loadings) <- newnames
为了保留最佳PCs数,PCAtools提供了四个方法:
我们可以绘制一个新的scree plot,并在图中标记不同方法得到的结果
library(ggplot2)
#两种方法
horn <- parallelPCA(x)
horn$n
elbow <- findElbowPoint(p$variance)
elbow
#绘图
screeplot(p,
components = getComponents(p, 1:20),
vline = c(horn$n, elbow)) +
geom_label(aes(x = horn$n + 1, y = 50,
label = 'Horn\'s', vjust = -1, size = 8)) +
geom_label(aes(x = elbow + 1, y = 50,
label = 'Elbow method', vjust = -1, size = 8))
修改bi-plot
比较PC1与PC2的bi-plot是PCA中最具特征的图。在bi-plot中,我们可以按不同的组对点进行着色并添加更多特征。
按metadata因素着色,使用自定义标签,通过原点添加线条,并添加图例
biplot(p,
lab = paste0(p$metadata$Age, ' años'),
colby = 'ER',
hline = 0, vline = 0,
legendPosition = 'right')
按组提供自定义颜色和包围变量
encircle功能实际上是在colby指定的每个组周围绘制一个多边形。
biplot(p,
colby = 'ER', colkey = c('ER+' = 'forestgreen', 'ER-' = 'purple'),
colLegendTitle = 'ER-\nstatus',
# encircle config
encircle = TRUE,
encircleFill = TRUE,
hline = 0, vline = c(-25, 0, 25),
legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)
统计椭圆
在这里,我们以 95% 的置信度在每个组周围绘制椭圆:
biplot(p,
colby = 'ER', colkey = c('ER+' = 'forestgreen', 'ER-' = 'purple'),
# ellipse config
ellipse = TRUE,
ellipseType = 't',
ellipseLevel = 0.95,
ellipseFill = TRUE,
ellipseAlpha = 1/4,
ellipseLineSize = 1.0,
xlim = c(-125,125), ylim = c(-50, 80),
hline = 0, vline = c(-25, 0, 25),
legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)
biplot(p,
colby = 'ER', colkey = c('ER+' = 'forestgreen', 'ER-' = 'purple'),
# ellipse config
ellipse = TRUE,
ellipseType = 't',
ellipseLevel = 0.95,
ellipseFill = TRUE,
ellipseAlpha = 1/4,
ellipseLineSize = 0,
ellipseFillKey = c('ER+' = 'yellow', 'ER-' = 'pink'),
xlim = c(-125,125), ylim = c(-50, 80),
hline = 0, vline = c(-25, 0, 25),
legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)
pairs plot
我们可以通过设置“triangle = FALSE”以更好地利用空间排列图片。在这种情况下,我们可以使用“ncol”和“nrow”参数进一步控制布局。
pairsplot(p,
components = getComponents(p, c(4,33,11,1)),
triangle = FALSE,
hline = 0, vline = 0,
pointSize = 0.8,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'ER',
title = 'Pairs plot', titleLabSize = 22,
axisLabSize = 14, plotaxes = TRUE,
margingaps = unit(c(0.1, 0.1, 0.1, 0.1), 'cm'))
将主成分与临床数据相关联
eigencorplot(p,
components = getComponents(p, 1:27),
metavars = c('Study','Age','Distant.RFS','ER',
'GGI','Grade','Size','Time.RFS'),
col = c('darkblue', 'blue2', 'black', 'red2', 'darkred'),
cexCorval = 0.7,
colCorval = 'white',
fontCorval = 2,
posLab = 'bottomleft',
rotLabX = 45,
posColKey = 'top',
cexLabColKey = 1.5,
scale = TRUE,
main = 'PC1-27 clinical correlations',
colFrame = 'white',
plotRsquared = FALSE)
进行显著性标记
eigencorplot(p,
components = getComponents(p, 1:horn$n),
metavars = c('Study','Age','Distant.RFS','ER','GGI',
'Grade','Size','Time.RFS'),
col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
cexCorval = 1.2,
fontCorval = 2,
posLab = 'all',
rotLabX = 45,
scale = TRUE,
main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ clinical ~ correlates),
plotRsquared = TRUE,
corFUN = 'pearson',
corUSE = 'pairwise.complete.obs',
corMultipleTestCorrection = 'BH',
signifSymbols = c('****', '***', '**', '*', ''),
signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))
小编总结
PCAtools是专门用于进行PCA分析的R包,功能还是相当丰富的,小伙伴们在进行PCA分析的时候可以多多学习哦!
END