PCA Tutorial
A tutorial on how to run principal component analyses and heirarchical clustering in R
Introduction
Principal component analysis (PCA) is a statistical process commonly used for dimensionality reduction by projecting each data point onto only the first few principal components to obtain lower-dimensional data while preserving as much of the data’s variation as possible. The first principal component can equivalently be defined as a direction that maximizes the variance of the projected data.
For more info: https://en.wikipedia.org/wiki/Principal_component_analysis
# devtools::install_github("derekmichaelwright/agData")
library(agData)
library(scales) # rescale()
library(GGally) # ggpairs()
library(plotly) # plot_ly()
library(rworldmap) # mapPies()
library(FactoMineR) # PCA() & HCPC()
library(htmlwidgets) # saveWidget()
# Prep data
<- "www.dblogr.com/ or derekmichaelwright.github.io/dblogr/ | Data: Anderson (1935) & Fisher (1936)"
myCaption1 <- "www.dblogr.com/ or derekmichaelwright.github.io/dblogr/ | Data: AGILE" myCaption2
Iris Data Set
Data Source
- Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. Annals of Eugenics. 7(II): 179–188.
- Anderson, Edgar (1935) The irises of the Gaspe Peninsula. Bulletin of the American Iris Society. 59: 2–5.
# print the data
::datatable(iris) DT
Correlations
# Plot
<- ggpairs(iris, columns = 1:4, aes(color = Species)) +
mp scale_color_manual(values = alpha(agData_Colors,0.7)) +
scale_fill_manual(values = alpha(agData_Colors,0.7)) +
theme_agData() +
labs(title = "Iris Data Set by Species", caption = myCaption1)
ggsave("pca_tutorial_1_01.png", mp, width = 8, height = 6)
PCA
# PCA
<- PCA(iris[,1:4], graph = F)
mypca # HPCA
<- HCPC(mypca, nb.clust = 3, graph = F)
myhpca #
<- mypca[[3]]$coord %>% as.data.frame()
pcs <- iris %>%
xx mutate(Cluster = myhpca[[1]]$clust) %>%
bind_cols(pcs)
#
<- c("darkslategrey", "darkred", "darkgreen")
myColors <- c(16, 13, 15)
myShapes # Plot
<- ggplot(xx, aes(x = Dim.1, y = Dim.2)) +
mp geom_point(aes(color = Species, pch = Cluster), size = 2, alpha = 0.8) +
scale_color_manual(values = myColors) +
scale_shape_manual(values = myShapes) +
theme_agData() +
labs(title = "Iris Data Set", x = "PC 1", y = "PC 2", caption = myCaption1)
ggsave("pca_tutorial_1_02.png", mp, width = 6, height = 4)
ggforce::geom_mark_hull
# Plot
<- mp +
mp ::geom_mark_hull(aes(fill = Cluster), expand = unit(3, "mm")) +
ggforcescale_fill_manual(values = myColors)
ggsave("pca_tutorial_1_03.png", mp, width = 6, height = 4)
3D Plots
Species
https://derekmichaelwright.github.io/dblogr/academic/pca_tutorial/pca_tutorial_iris_1.html
<- plot_ly(iris, x = ~Sepal.Length, y = ~Sepal.Width, z = ~Petal.Length, text = ~Species,
mp color = ~Species, colors = myColors,
symbol = ~Species, symbols = myShapes) %>%
add_markers()
::saveWidget(mp, file="pca_tutorial_iris_1.html") htmlwidgets
PCA
https://derekmichaelwright.github.io/dblogr/academic/pca_tutorial/pca_tutorial_iris_2.html
<- plot_ly(xx, x = ~Dim.1, y = ~Dim.2, z = ~Dim.3, text = ~Species,
mp color = ~Species, colors = myColors,
symbol = ~Cluster, symbols = myShapes) %>%
add_markers()
::saveWidget(mp, file="pca_tutorial_iris_2.html") htmlwidgets
LDP Data Set
Data Source
Derek M. Wright, Sandesh Neupane, Taryn Heidecker, Teketel A. Haile, Crystal Chan, Clarice J. Coyne, Rebecca J. McGee, Sripada Udupa, Fatima Henkrar, Eleonora Barilli, Diego Rubiales, Tania Gioia, Giuseppina Logozzo, Stefania Marzario, Reena Mehra, Ashutosh Sarker, Rajeev Dhakal, Babul Anwar, Debashish Sarker, Albert Vandenberg & Kirstin E. Bett. Understanding photothermal interactions can help expand production range and increase genetic diversity of lentil (Lens culinaris Medik.). Plants, People, Planet. (2020) 00: 1-11. doi.org/10.1002/ppp3.10158
Raw Data
# Prep data
<- read.csv("data_ldp.csv")
myLDP <- read.csv("data_phenology.csv")
myD <- read.csv("data_countries.csv")
myC <- c("Ro16", "Ro17", "Su16", "Su17", "Su18", "Us18",
myExpts "In16", "In17", "Ba16", "Ba17", "Ne16", "Ne17",
"Sp16", "Sp17", "Mo16", "Mo17", "It16", "It17" )
<- c("darkred", "darkorange3", "darkgoldenrod2", "deeppink3",
myColors "steelblue", "darkorchid4", "cornsilk4", "darkgreen")
<- myD %>%
xx mutate(Expt = factor(Expt, levels = myExpts)) %>%
group_by(Expt) %>%
mutate(DTF_Scaled = rescale(DTF, c(1,5))) %>%
ungroup() %>%
gather(Trait, Value, DTF, DTF_Scaled)
<- ggplot(xx, aes(x = Expt, y = Value)) +
mp geom_quasirandom(size = 0.2, alpha = 0.5) +
facet_grid(Trait ~ ., scales = "free_y") +
theme_agData() +
labs(y = NULL, x = NULL, caption = myCaption2)
ggsave("pca_tutorial_2_01.png", mp, width = 8, height = 6)
PCA
# Prep data
<- read.csv("data_phenology.csv") %>%
xx mutate(Expt = factor(Expt, levels = myExpts),
DTF = round(DTF)) %>%
select(Entry, Expt, DTF) %>%
spread(Expt, DTF) %>%
column_to_rownames("Entry")
::datatable(xx) DT
# PCA
<- PCA(xx, ncp = 10, graph = F)
mypca # Heirarcical clustering
<- HCPC(mypca, nb.clust = 8, graph = F)
mypcaH # Bind together
<- mypcaH[[1]] %>% rownames_to_column("Entry")
x1 <- mypca[[3]]$coord %>% as.data.frame() %>% rownames_to_column("Entry")
x2 <- left_join(x1, x2, by = "Entry") %>%
myPC mutate(Entry = as.numeric(Entry)) %>%
left_join(select(myLDP, Entry, Name, Origin), by = "Entry") %>%
left_join(select(myC, Origin=Country, Region), by = "Origin") %>%
select(Entry, Name, Origin, Region, Cluster=clust,
PC1=Dim.1, PC2=Dim.2, PC3=Dim.3, PC4=Dim.4, PC5=Dim.5,
PC6=Dim.6, PC7=Dim.7, PC8=Dim.8, PC9=Dim.9, PC10=Dim.10)
# Print PC's
::datatable(myPC) DT
PCA Plots
# Plot
<- ggplot(myPC, aes(x = PC1, y = PC2, color = Cluster)) +
mp geom_point() +
scale_color_manual(values = myColors) +
theme_agData(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()) +
labs(caption = myCaption2)
ggsave("pca_tutorial_2_02.png", mp, width = 6, height = 4)
https://derekmichaelwright.github.io/dblogr/academic/pca_tutorial/pca_tutorial_ldp_1.html
<- plot_ly(myPC, x = ~PC1, y = ~PC2*2.5, z = ~PC3*2.5,
mp text = ~paste(Entry, "|", Name, "\nOrigin:", Origin),
color = ~Cluster, colors = myColors) %>%
add_markers()
::saveWidget(mp, file="pca_tutorial_ldp_1.html") htmlwidgets
PC1 vs PC2 vs PC3
# Prep data
<- round(mypca[[1]][,2], 1)
perc # Plot PCA 1v2
<- function(df) df[chull(df[,"PC1"], df[,"PC2"]), ]
find_hull <- plyr::ddply(myPC, "Cluster", find_hull) %>% mutate(Cluster = factor(Cluster))
polys <- ggplot(myPC) +
mp1 geom_polygon(data = polys, alpha = 0.15, aes(x = PC1, y = PC2, fill = Cluster)) +
geom_point(aes(x = PC1, y = PC2, colour = Cluster)) +
scale_fill_manual(values = myColors) +
scale_color_manual(values = myColors) +
theme_agData() +
theme(legend.position = "none", panel.grid = element_blank()) +
labs(x = paste0("PC1 (", perc[1], "%)"),
y = paste0("PC2 (", perc[2], "%)"))
# Plot PCA 1v3
<- function(df) df[chull(df[,"PC1"], df[,"PC3"]), ]
find_hull <- plyr::ddply(myPC, "Cluster", find_hull) %>% mutate(Cluster = factor(Cluster))
polys <- ggplot(myPC) +
mp2 geom_polygon(data = polys, alpha = 0.15, aes(x = PC1, y = PC3, fill = Cluster)) +
geom_point(aes(x = PC1, y = PC3, colour = Cluster)) +
scale_fill_manual(values = myColors) +
scale_color_manual(values = myColors) +
theme_agData() +
theme(legend.position = "none", panel.grid = element_blank()) +
labs(x = paste0("PC1 (", perc[1], "%)"),
y = paste0("PC3 (", perc[3], "%)"))
# Plot PCA 2v3
<- function(df) df[chull(df[,"PC2"], df[,"PC3"]), ]
find_hull <- plyr::ddply(myPC, "Cluster", find_hull) %>% mutate(Cluster = factor(Cluster))
polys <- ggplot(myPC) +
mp3 geom_polygon(data = polys, alpha = 0.15, aes(x = PC2, y = PC3, fill = Cluster)) +
geom_point(aes(x = PC2, y = PC3, colour = Cluster)) +
scale_fill_manual(values = myColors) +
scale_color_manual(values = myColors) +
theme_agData() +
theme(legend.position = "none", panel.grid = element_blank()) +
labs(x = paste0("PC2 (", perc[2], "%)"),
y = paste0("PC3 (", perc[3], "%)"),
caption = myCaption2)
# Append
<- ggarrange(mp1, mp2, mp3, nrow = 1, ncol = 3, hjust = 0)
mp ggsave("pca_tutorial_2_03.png", mp, width = 8, height = 2.5)
Phenotypes by Cluster Group
# Prep data
<- read.csv("data_phenology.csv") %>%
xx mutate(Expt = factor(Expt, levels = myExpts)) %>%
group_by(Expt) %>%
mutate(DTF_Scaled = scales::rescale(DTF, c(1,5)),
DTF_Scaled = round(DTF_Scaled,1)) %>%
ungroup() %>%
select(Entry, Expt, DTF_Scaled) %>%
left_join(select(myPC, Entry, Cluster), by = "Entry") %>%
group_by(Expt, Cluster) %>%
summarise(mean = mean(DTF_Scaled, na.rm = T), sd = sd(DTF_Scaled, na.rm = T)) %>%
ungroup()
# Plot
<- ggplot(xx, aes(x = Expt, y = mean, group = Cluster)) +
mp geom_point(aes(color = Cluster)) +
geom_vline(xintercept = 6.5, lty = 2) +
geom_vline(xintercept = 12.5, lty = 2) +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = Cluster),
alpha = 0.2, color = NA) +
geom_line(aes(color = Cluster), size = 1) +
scale_color_manual(values = myColors) +
scale_fill_manual(values = myColors) +
theme_agData() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "none", panel.grid = element_blank()) +
labs(y = "DTF (scaled 1-5)", x = NULL, caption = myCaption2)
#
ggsave("pca_tutorial_2_04.png", mp, width = 8, height =3.5)
Pie Map
# Plot
<- myPC %>%
xx filter(!Origin %in% c("ICARDA","USDA","Unknown")) %>%
group_by(Origin, Cluster) %>%
summarise(Count = n()) %>%
spread(Cluster, Count) %>%
left_join(select(myC, Origin=Country, Lat, Lon), by = "Origin") %>%
ungroup() %>%
as.data.frame()
is.na(xx)] <- 0
xx[png("pca_tutorial_2_05.png", width = 4800, height = 2200, res = 600)
par(mai = c(0,0,0,0), xaxs = "i", yaxs = "i")
mapPies(dF = xx, nameX = "Lon", nameY = "Lat", zColours = myColors,
nameZs = c("1","2","3","4","5","6","7","8"), lwd = 1,
xlim = c(-140,110), ylim = c(0,20), addCatLegend = F,
oceanCol = "grey90", landCol = "white", borderCol = "black")
legend(-139.5, 15.5, title = "Cluster", legend = 1:8, col = myColors,
pch = 16, cex = 1, pt.cex = 2, box.lwd = 2)
dev.off()