给PCA画个圈圈儿

给PCA的结果加上轮廓范围

1.了解PCA

今天的主要目的就是给PCA的分组画图画个圈圈儿,对PCA的认识要有一定的基础,如果你不知道何为PCA以及对PCA的结果数据不太明白,强烈推荐补习一下知识。参见生信技能树文章 PCA-弱水三千,取哪一瓢饮PCA-Statistics is the new sexy!!!

接着认识下PCA的结果,拆解并了解PCA结果构造并画图

2.拆解PCA结果

强烈建议手动运行以下代码

#代码来自这里 https://huboqiang.cn/2016/03/03/RscatterPlotPCA

head(iris)
df <- iris
df <- as.data.frame(iris)
row.names(df) <- paste(df$Species, row.names(df), sep="_") 
df$Species <- NULL

head(df)
df_pca <- prcomp(df)
plot(df_pca$x[,1], df_pca$x[,2])

df_out <- as.data.frame(df_pca$x)
df_out$group <- sapply( strsplit(as.character(row.names(df)), "_"), "[[", 1 )
head(df_out)
library(ggplot2)
library(grid)
library(gridExtra)
p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group ))
p<-p+geom_point()
p
theme<-theme(panel.background = element_blank(),panel.border=element_rect(fill=NA),panel.grid.major = element_blank(),panel.grid.minor = element_blank(),strip.background=element_blank(),axis.text.x=element_text(colour="black"),axis.text.y=element_text(colour="black"),axis.ticks=element_line(colour="black"),plot.margin=unit(c(1,1,1,1),"line"))

p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group ))
p<-p+geom_point()+theme
p
p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group, label=row.names(df) ))
p<-p+geom_point()+ geom_text(size=3)+theme
p
percentage <- round(df_pca$sdev / sum(df_pca$sdev) * 100, 2)

percentage <- paste( colnames(df_out), "(", paste( as.character(percentage), "%", ")", sep="") )
#注意这个技巧

p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group ))
p<-p+geom_point()+theme + xlab(percentage[1]) + ylab(percentage[2])
p


df_out$group <- factor(df_out$group, levels = c("virginica", "setosa", "versicolor"))

p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group ))
p<-p+geom_point()+theme + xlab(percentage[1]) + ylab(percentage[2]) + scale_color_manual(values=c("#FFFF00", "#00FFFF", "#FF00FF"))
p

pdf("file_out.pdf",width = 10,height = 10)
library(gridExtra)
yy <- grid.arrange(p,nrow=1)
op <- par(no.readonly=TRUE)
par(op)
dev.off()

3.给PCA结果加上轮廓

构造完数据你可以看到用prcomp生成的PCA结果每一部分的作用;

splitData <- split(df_out, df_out$group)
appliedData <- lapply(splitData, function(df){
  df[chull(df), ]  # chull really is useful, even outside of contrived examples.
})
#注意这个split和lapply技巧,chull函数只对前两列起作用;
combinedData <- do.call(rbind, appliedData)
zp3 <- ggplot(data = df_out,
              aes(x = PC1, y = PC2,shape=group,color=group))+
              geom_point(size=3) + 
              theme_bw()
zp3 <- zp3 + geom_polygon(data = combinedData,  # This is also a nice example of how to plot
                          aes(x = PC1, y = PC2,fill=group,color=group),  # two superimposed geoms
                          alpha = 0.5,   # color 参数设置边框颜色,size 参数设置边框大小
                          size=1 #加了边框
                          )    # from different data.frames
zp3 <- zp3 + coord_equal()
# zp3 <- zp3 + 
          # scale_fill_manual(values = colorRampPalette(rev(brewer.pal(11, "Spectral")))(3)) 
          # 设置了填充色;
# install.packages("plotly")
library(plotly)
fig <- ggplotly(zp3) # 这个R包也是棒呆
fig
PCA_border

执行完以上代码你会发现有个 chull 函数,他就是用来统计数据的凸点的位置;参数支持两个拥有两个列的一组数据或者或者两个向量,就是这个函数支持数据框的前两列,这里是满足PCA结果单作图要求的;

# chull 函数,统计边界点的值;貌似很有用
# NOT RUN {
X <- matrix(stats::rnorm(2000), ncol = 2)
chull(X)
# }
# NOT RUN {
# Example usage from graphics package
plot(X, cex = 0.5)
hpts <- chull(X)
hpts <- c(hpts, hpts[1])
lines(X[hpts, ])
# }

4.插入圆形轮廓

如果你想插入一个圆形轮廓,那继续看这里

p3 <- ggplot(data = df_out,
              aes(x = PC1, y = PC2,shape=group,color=group))+
              geom_point(size=3) + 
              theme_bw()

zp3 <- p3 +stat_ellipse(geom="polygon", aes(fill = group), 
               alpha = 0.1, 
               show.legend = FALSE, 
               level = 0.95,
               type = "norm")
zp3
ggplotly(zp3)
PCA_stat_ellipse

另外感兴趣的话试试不同参数的圈地范围吧

p3 + stat_ellipse(type = "t")
p3 + stat_ellipse(type = "norm")
p3 + stat_ellipse(type = "euclid")

参考:

  1. https://mp.weixin.qq.com/s/MUr_wcS3c5KFknB2dCCc0g
  2. https://huboqiang.cn/2016/03/03/RscatterPlotPCA
  3. https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html
  4. https://plotly.com/ggplot2/geom_polygon/

comments powered by Disqus