高维数据 | R语言绘图基础之主成分分析

黑妹的小屋

发布于 2020-08-06 15:05:15

1.9K0

发布于 2020-08-06 15:05:15

高维数据可视化之主成分分析

在视觉方面，人类普遍只能直观感知二维和三维空间。高维数据的可视化，就是将高维数据投影到二维或三维空间，去掉冗余属性，同时尽量保留高维空间中的数据特征。说白了，高维数据的可视化就是使用降维方法。降维方法主要分成线性和非线性两大类：关于非线性的非度量多维尺度分析（NMDS），见往期文章《非度量多维尺度分析（NMDS）》；关于线性的PCA方法，见往期文章《PCA做图最佳搭档-ggbiplot》。本文主要记录迷弥小粉丝在绘制线性PCA图的数据处理过程中遇到的问题。

原始数据

#原始数据由迷弥小可爱提供；为对具体数值保密，此处展示的是用随机数生成的部分数据，截图如下：

从原始数据来看，列名“处理组”和“时间”是中文字符，需要改成英文列名“Treament”和“Time”，这样用readr导入后才便于在RStudio中正确识别和引用。（注意：“Treament”是本文数据和代码中沿用的拼写，正确拼写应为“Treatment”；无论采用哪种拼写，数据中的列名都必须与后文代码中的Enzyme$Treament保持一致。）

ggbiplot绘制

#数据的导入。

# Attach readr, which provides read_csv().
library(readr)
# Import the enzyme measurements from the CSV file into Enzyme.
Enzyme <- read_csv("~/Desktop/主成分分析/Enzyme.csv")
# Open the imported table in the data viewer to check that it loaded
# correctly (e.g. no garbled characters).
View(Enzyme)

#查看Enzyme，确保导入成功，不乱码。

#数据处理。

我们处理数据的目标是研究不同的Treament对MDA，LOX等指标的影响。

# Preprocessing: keep columns 2 to 10 of Enzyme as df1.
# The printout below shows that df1 is a 36 x 9 tibble (not a matrix):
# Time plus the eight measured variables MDA ... TPC.
>df1<-Enzyme[,2:10]
> df1
# A tibble: 36 x 9
    Time   MDA   LOX   APX   PAL   CAT   POD   PPO   TPC
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1    40    41    19    58    80 186.     40    20   965
 2    40    42    20    55   155 172.     34    37   955
 3    40    43    17    51    92 166.     30    36   937
 4    40    30    23    57   208 163.     85    23  1261
 5    40    34    24    58   180 146.     89    26  1249
 6    40    38    24    52    99 167.     81    21  1269
 7    40    40    27    55   150  70.7    81    16  1322
 8    40    30    30    54   104  56.7    81    16  1312
 9    40    43    29    55   167  73.7    82    22  1357
10    40    46    27    58    96  66.7    64    22  1080
# … with 26 more rows
# Run a principal component analysis on df1; the result is returned as an
# object of class prcomp. scale. = TRUE scales every variable to unit
# variance before the analysis.
# NOTE(review): df1 still contains the Time column, so Time enters the PCA as
# a variable (it appears in the rotation matrix printed below) - confirm this
# is intended.
>Enzyme1.pca <- prcomp(df1,scale. = TRUE)
> Enzyme1.pca
Standard deviations (1, .., p=9):
[1] 2.0909293 1.1638511 1.0689045 0.9458037 0.7227202 0.6075112 0.5181985 0.2353166 0.1451410

Rotation (n x k) = (9 x 9):
             PC1         PC2         PC3          PC4         PC5         PC6         PC7
Time  0.46315503 -0.04916147  0.03637764  0.162115952  0.09831722 -0.05605495  0.15384446
MDA   0.43051775 -0.04773755 -0.14795093 -0.069491134 -0.36795168 -0.34716366 -0.30304858
LOX  -0.09369929  0.32940027 -0.72371726 -0.441381496 -0.08318472 -0.00732021  0.37465711
APX  -0.08716532 -0.75858481 -0.10162202 -0.059575764 -0.45541633  0.34852207  0.26596939
PAL   0.18847050 -0.30570836  0.24096297 -0.784312315  0.43453792 -0.08163434 -0.03770386
CAT  -0.37539362 -0.03778714  0.33391251 -0.026114784 -0.18066354 -0.69437542  0.46568301
POD   0.41870137  0.00242730  0.02591931  0.254736824  0.31195985  0.10630817  0.65436944
PPO   0.44279023 -0.08209132 -0.12802074 -0.008043973 -0.27235413 -0.34781106 -0.01258019
TPC  -0.18960223 -0.45795031 -0.50603522  0.299564950  0.49765782 -0.36567563 -0.16354171
              PC8         PC9
Time -0.423321246  0.73470281
MDA  -0.488029855 -0.44691662
LOX  -0.086060923  0.09687049
APX  -0.054174793  0.02299334
PAL   0.013325895 -0.02694037
CAT  -0.113141408  0.03184374
POD   0.038176838 -0.46993978
PPO   0.746837400  0.16634344
TPC  -0.009080942 -0.01764606

#ggbiplot绘图

# Biplot of the prcomp result, with observations grouped by treatment.
# Needs Enzyme (the imported table) and Enzyme1.pca (the prcomp fit) from the
# earlier steps.
library(ggbiplot)

# Every `+` has to END a line. A line that starts with `+` is parsed as a new
# statement, so the ggbiplot() call would finish on its own and the scale and
# theme layers would raise an error instead of being added to the plot.
ggbiplot(
  Enzyme1.pca,
  obs.scale = 1,
  var.scale = 1,
  groups = Enzyme$Treament, # column name as spelled in the data file
  ellipse = TRUE,           # draw an ellipse around each group
  circle = TRUE             # draw the correlation circle
) +
  scale_color_discrete(name = "") + # empty legend title
  theme(legend.direction = "horizontal", legend.position = "top")

fviz_pca_ind绘制-1

#数据处理。

# PCA() comes from FactoMineR, so the package must be attached before the
# call below (attaching it again later is harmless).
library(FactoMineR)

# Keep columns 2 to 10 of Enzyme as the PCA input. The result printout below
# reports 9 variables, i.e. Time is included alongside the measured indices.
df <- Enzyme[2:10]
View(df)

# Run the PCA; graph = FALSE suppresses FactoMineR's default plots because the
# figure is drawn separately with fviz_pca_ind().
Enzyme.pca <- PCA(df, graph = FALSE)
> Enzyme.pca
**Results for the Principal Component Analysis (PCA)**
The analysis was performed on 36 individuals, described by 9 variables
*The results are available in the following objects:


   name               description                          
1  "$eig"             "eigenvalues"                        
2  "$var"             "results for the variables"          
3  "$var$coord"       "coord. for the variables"           
4  "$var$cor"         "correlations variables - dimensions"
5  "$var$cos2"        "cos2 for the variables"             
6  "$var$contrib"     "contributions of the variables"     
7  "$ind"             "results for the individuals"        
8  "$ind$coord"       "coord. for the individuals"         
9  "$ind$cos2"        "cos2 for the individuals"           
10 "$ind$contrib"     "contributions of the individuals"   
11 "$call"            "summary statistics"                 
12 "$call$centre"     "mean of the variables"              
13 "$call$ecart.type" "standard error of the variables"    
14 "$call$row.w"      "weights for the individuals"        
15 "$call$col.w"      "weights for the variables"

#fviz_pca_ind()绘图

# Individuals plot of the FactoMineR PCA result, filled by treatment group.
# Needs Enzyme (the imported table) and Enzyme.pca (the PCA fit) from the
# earlier steps.
library(factoextra)
library(FactoMineR)

# The `+` must END the line: a line starting with `+theme_grey()` is parsed as
# a separate statement and the theme is never applied to the plot.
fviz_pca_ind(
  Enzyme.pca,
  geom.ind = "point",  # show individuals as points only (no labels)
  pointsize = 3,
  pointshape = 21,     # shape 21 is a fillable circle, required for fill.ind
  fill.ind = Enzyme$Treament, # column name as spelled in the data file
  palette = c("#00AFBB", "#E7B800", "#FC4E07", "blue"), # one colour per group
  addEllipses = TRUE,
  legend.title = "Groups", # was misspelled `legend.titl`
  title = ""
) +
  theme_grey()

fviz_pca_ind绘制-2

#数据处理。

我们处理数据的目标是研究在不同的作用时间Time对MDA，LOX等指标的影响。若直接按照fviz_pca_ind绘制-1提供的方法绘制PCA，这时候会出现以下错误：

# Intentionally failing call, kept to reproduce the error shown on the next
# line: Enzyme$Time is still numeric at this point (see the str() output
# below), while a discrete three-colour palette is supplied for fill.ind.
# Enzyme2.pca must already exist; it is built from Enzyme[,3:10] further down.
>fviz_pca_ind(Enzyme2.pca,geom.ind="point",pointsize=4,pointshape=21,fill.ind=Enzyme$Time,palette=c("#00AFBB","#E7B800","#FC4E07"),addEllipses=TRUE,legend.titl="Groups",title="")+theme_grey()
错误: Continuous value supplied to discrete scale

这是什么原因呢？？？我们来查看数据框

>str(Enzyme$Time)
num [1:36] 40 40 40 40 40 40 40 40 40 40 ...
#查看数据框中某列的数据类型，会发现Time是num数值型（连续变量），
#而fill.ind=Enzyme$Time与离散调色板palette配合使用时，需要离散型的分组变量（因子或字符型）。
>as.character(Enzyme$Time)
 [1] "40"  "40"  "40"  "40"  "40"  "40"  "40"  "40"  "40"  "40"  "40"  "40"  "80"  "80"  "80" 
[16] "80"  "80"  "80"  "80"  "80"  "80"  "80"  "80"  "80"  "120" "120" "120" "120" "120" "120"
[31] "120" "120" "120" "120" "120" "120"
#as.character()把Time所在列的数据转换为字符型向量；这里只是打印转换结果，并没有修改Enzyme。
>Enzyme$Time<-as.character(Enzyme$Time)
#把转换后的字符型向量写回数据框，列名仍为Time（得到的是字符型chr，而不是因子factor；如需控制图例中分组的顺序，可改用factor(Enzyme$Time, levels = c(40, 80, 120))）。
>str(Enzyme$Time)
  chr [1:36] "40" "40" "40" "40" "40" "40" "40" "40" "40" "40" "40" "40" "80" "80" "80" "80" ...
#再次查看，Time的类型，发现修改成功，变成字符型chr了。

#绘图

# Attach factoextra (fviz_pca_ind) and FactoMineR (PCA).
> library(factoextra)
> library(FactoMineR)
# Keep columns 3 to 10 only. The printout below lists MDA ... TPC, so Time is
# no longer one of the PCA variables and serves only as the grouping column.
> df2<-Enzyme[,3:10]
> df2
# A tibble: 36 x 8
     MDA   LOX   APX   PAL   CAT   POD   PPO   TPC
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1    41    19    58    80 186.     40    20   965
 2    42    20    55   155 172.     34    37   955
 3    43    17    51    92 166.     30    36   937
 4    30    23    57   208 163.     85    23  1261
 5    34    24    58   180 146.     89    26  1249
 6    38    24    52    99 167.     81    21  1269
 7    40    27    55   150  70.7    81    16  1322
 8    30    30    54   104  56.7    81    16  1312
 9    43    29    55   167  73.7    82    22  1357
10    46    27    58    96  66.7    64    22  1080
# … with 26 more rows
# Run the PCA on the eight measured variables in df2; graph=FALSE suppresses
# FactoMineR's default plots.
> Enzyme2.pca<-PCA(df2,graph=FALSE)

> Enzyme2.pca
**Results for the Principal Component Analysis (PCA)**
The analysis was performed on 36 individuals, described by 8 variables
*The results are available in the following objects:

   name               description                          
1  "$eig"             "eigenvalues"                        
2  "$var"             "results for the variables"          
3  "$var$coord"       "coord. for the variables"           
4  "$var$cor"         "correlations variables - dimensions"
5  "$var$cos2"        "cos2 for the variables"             
6  "$var$contrib"     "contributions of the variables"     
7  "$ind"             "results for the individuals"        
8  "$ind$coord"       "coord. for the individuals"         
9  "$ind$cos2"        "cos2 for the individuals"           
10 "$ind$contrib"     "contributions of the individuals"   
11 "$call"            "summary statistics"                 
12 "$call$centre"     "mean of the variables"              
13 "$call$ecart.type" "standard error of the variables"    
14 "$call$row.w"      "weights for the individuals"        
15 "$call$col.w"      "weights for the variables"  
# Individuals plot of the second PCA, filled by treatment time.
# Needs Enzyme2.pca from the step above, and Enzyme$Time must already have been
# converted from numeric to a discrete (character or factor) column; a numeric
# column triggers "Continuous value supplied to discrete scale".
# The `+` must END the line: a line starting with `+theme_grey()` is parsed as
# a separate statement and the theme is never applied to the plot.
fviz_pca_ind(
  Enzyme2.pca,
  geom.ind = "point",  # show individuals as points only (no labels)
  pointsize = 4,
  pointshape = 21,     # shape 21 is a fillable circle, required for fill.ind
  fill.ind = Enzyme$Time,
  palette = c("#00AFBB", "#E7B800", "#FC4E07"), # one colour per time level
  addEllipses = TRUE,
  legend.title = "Groups", # was misspelled `legend.titl`
  title = ""
) +
  theme_grey()