给PCA画个圈圈儿
给PCA的结果加上轮廓范围
1.了解PCA
今天的主要目的就是给PCA的分组画图画个圈圈儿,对PCA的认识要有一定的基础,如果你不知道何为PCA以及对PCA的结果数据不太明白,强烈推荐补习一下知识。参见生信技能树文章 PCA-弱水三千,取哪一瓢饮 和 PCA-Statistics is the new sexy!!!
接着认识下PCA的结果,拆解并了解PCA结果构造并画图
2.拆解PCA结果
强烈建议手动运行以下代码
#代码来自这里 https://huboqiang.cn/2016/03/03/RscatterPlotPCA
head(iris)
df <- iris
df <- as.data.frame(iris)
row.names(df) <- paste(df$Species, row.names(df), sep="_")
df$Species <- NULL
head(df)
df_pca <- prcomp(df)
plot(df_pca$x[,1], df_pca$x[,2])
df_out <- as.data.frame(df_pca$x)
df_out$group <- sapply( strsplit(as.character(row.names(df)), "_"), "[[", 1 )
head(df_out)
library(ggplot2)
library(grid)
library(gridExtra)
p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group ))
p<-p+geom_point()
p
theme<-theme(panel.background = element_blank(),panel.border=element_rect(fill=NA),panel.grid.major = element_blank(),panel.grid.minor = element_blank(),strip.background=element_blank(),axis.text.x=element_text(colour="black"),axis.text.y=element_text(colour="black"),axis.ticks=element_line(colour="black"),plot.margin=unit(c(1,1,1,1),"line"))
p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group ))
p<-p+geom_point()+theme
p
p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group, label=row.names(df) ))
p<-p+geom_point()+ geom_text(size=3)+theme
p
percentage <- round(df_pca$sdev / sum(df_pca$sdev) * 100, 2)
percentage <- paste( colnames(df_out), "(", paste( as.character(percentage), "%", ")", sep="") )
#注意这个技巧
p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group ))
p<-p+geom_point()+theme + xlab(percentage[1]) + ylab(percentage[2])
p
df_out$group <- factor(df_out$group, levels = c("virginica", "setosa", "versicolor"))
p<-ggplot(df_out,aes(x=PC1,y=PC2,color=group ))
p<-p+geom_point()+theme + xlab(percentage[1]) + ylab(percentage[2]) + scale_color_manual(values=c("#FFFF00", "#00FFFF", "#FF00FF"))
p
pdf("file_out.pdf",width = 10,height = 10)
library(gridExtra)
yy <- grid.arrange(p,nrow=1)
op <- par(no.readonly=TRUE)
par(op)
dev.off()
3.给PCA结果加上轮廓
构造完数据你可以看到用prcomp生成的PCA结果每一部分的作用;
splitData <- split(df_out, df_out$group)
appliedData <- lapply(splitData, function(df){
df[chull(df), ] # chull really is useful, even outside of contrived examples.
})
#注意这个split和lapply技巧,chull函数只对前两列起作用;
combinedData <- do.call(rbind, appliedData)
zp3 <- ggplot(data = df_out,
aes(x = PC1, y = PC2,shape=group,color=group))+
geom_point(size=3) +
theme_bw()
zp3 <- zp3 + geom_polygon(data = combinedData, # This is also a nice example of how to plot
aes(x = PC1, y = PC2,fill=group,color=group), # two superimposed geoms
alpha = 0.5, # color 参数设置边框颜色,size 参数设置边框大小
size=1 #加了边框
) # from different data.frames
zp3 <- zp3 + coord_equal()
# zp3 <- zp3 +
# scale_fill_manual(values = colorRampPalette(rev(brewer.pal(11, "Spectral")))(3))
# 设置了填充色;
# install.packages("plotly")
library(plotly)
fig <- ggplotly(zp3) # 这个R包也是棒呆
fig
执行完以上代码你会发现有个 chull 函数,他就是用来统计数据的凸点的位置;参数支持两个拥有两个列的一组数据或者或者两个向量,就是这个函数支持数据框的前两列,这里是满足PCA结果单作图要求的;
# chull 函数,统计边界点的值;貌似很有用
# NOT RUN {
X <- matrix(stats::rnorm(2000), ncol = 2)
chull(X)
# }
# NOT RUN {
# Example usage from graphics package
plot(X, cex = 0.5)
hpts <- chull(X)
hpts <- c(hpts, hpts[1])
lines(X[hpts, ])
# }
4.插入圆形轮廓
如果你想插入一个圆形轮廓,那继续看这里
p3 <- ggplot(data = df_out,
aes(x = PC1, y = PC2,shape=group,color=group))+
geom_point(size=3) +
theme_bw()
zp3 <- p3 +stat_ellipse(geom="polygon", aes(fill = group),
alpha = 0.1,
show.legend = FALSE,
level = 0.95,
type = "norm")
zp3
ggplotly(zp3)
另外感兴趣的话试试不同参数的圈地范围吧
p3 + stat_ellipse(type = "t")
p3 + stat_ellipse(type = "norm")
p3 + stat_ellipse(type = "euclid")