# Load required packages
library(car) # For ANOVA tests
library(emmeans) # For estimated marginal means
library(psych)
library(vegan)
library(ggfortify)
library(corrplot)
library(FactoMineR)
library(factoextra)
library(ggrepel)
library(plotly)
library(broom) # For model summaries
library(patchwork) # For combining plots
library(janitor) # For cleaning names
library(tidyverse) # For data manipulation and visualization
# Set theme for all plots
theme_set(theme_minimal(base_size = 12))
# # Set options
# options(scipen = 999)Lecture 16 - Class Activity MANOVA
Lecture 16: PCA
# read file
# Load the data
darlingtonia <- read_csv("darlingtonia.csv")Rows: 87 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): site
dbl (10): height, mouth_diam, tube_diam, keel_diam, wing1_length, wing2_leng...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the structure
head(darlingtonia)# A tibble: 6 × 11
site height mouth_diam tube_diam keel_diam wing1_length wing2_length
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 TJH 654 38.4 16.6 6.4 85 76
2 TJH 413 22.2 17.2 5.9 55 26
3 TJH 610 31.2 19.9 6.7 62 60
4 TJH 546 34.4 20.8 6.3 84 79
5 TJH 665 30.5 20.4 6.6 60 51
6 TJH 665 33.6 19.5 6.6 84 66
# ℹ 4 more variables: wingsprea <dbl>, hoodmass_g <dbl>, tubemass_g <dbl>,
# wingmass_g <dbl>
darlingtonia %>%
count(site, name = "n_plants")# A tibble: 4 × 2
site n_plants
<chr> <int>
1 DG 25
2 HD 12
3 LEH 25
4 TJH 25
# Create long format for easier plotting
darl_long <- darlingtonia %>%
pivot_longer(cols = -site,
names_to = "variable",
values_to = "value")
# Box plots by site
ggplot(darl_long, aes(x = site, y = value, fill = site)) +
geom_boxplot() +
facet_wrap(~variable, scales = "free_y", ncol = 2) +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Raw Data by Site",
subtitle = "Notice the different scales!")# Example: standardize height
darlingtonia %>%
group_by(site) %>%
summarize(
mean_height = mean(height),
sd_height = sd(height),
mean_wingmass = mean(wingmass_g),
sd_wingmass = sd(wingmass_g)
)# A tibble: 4 × 5
site mean_height sd_height mean_wingmass sd_wingmass
<chr> <dbl> <dbl> <dbl> <dbl>
1 DG 619. 101. 0.225 0.0682
2 HD 574. 101. 0.0567 0.0202
3 LEH 638. 115. 0.151 0.122
4 TJH 610. 82.0 0.134 0.0853
# After standardization, both will have:
# mean = 0, sd = 1# Select only numeric variables for PCA
darl_numeric <- darlingtonia %>%
dplyr::select(-site)
# Standardize using scale()
# This centers (mean=0) and scales (sd=1) each variable
darl_scaled <- scale(darl_numeric)
darl_scaled_df <- as.data.frame(darl_scaled)
darl_scaled_df %>%
select(height, mouth_diam, wingmass_g) %>%
pivot_longer(cols = c(height, mouth_diam, wingmass_g) , names_to = "variable", values_to = "value") %>%
ggplot(aes(x = variable, y = value, fill = variable)) +
geom_boxplot() +
labs(title = "Standardized Data (same scale)") +
theme(legend.position = "none")# Calculate correlation matrix
cor_matrix <- cor(darl_numeric)
# Visualize with corrplot
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black",
tl.cex = 0.7,
addCoef.col = "black",
number.cex = 0.6,
title = "Correlation Matrix\n(Darlingtonia variables)",
mar = c(0,0,2,0))# Calculate correlation matrix
cor_matrix <- cor(darl_numeric)
# Display rounded values
round(cor_matrix, 2) height mouth_diam tube_diam keel_diam wing1_length wing2_length
height 1.00 0.61 0.24 -0.06 0.28 0.29
mouth_diam 0.61 1.00 -0.05 -0.39 0.57 0.44
tube_diam 0.24 -0.05 1.00 0.65 -0.09 0.01
keel_diam -0.06 -0.39 0.65 1.00 -0.35 -0.24
wing1_length 0.28 0.57 -0.09 -0.35 1.00 0.84
wing2_length 0.29 0.44 0.01 -0.24 0.84 1.00
wingsprea 0.12 0.25 0.10 -0.19 0.64 0.71
hoodmass_g 0.49 0.76 -0.11 -0.34 0.61 0.49
tubemass_g 0.84 0.72 0.10 -0.24 0.44 0.36
wingmass_g 0.40 0.62 -0.16 -0.36 0.76 0.69
wingsprea hoodmass_g tubemass_g wingmass_g
height 0.12 0.49 0.84 0.40
mouth_diam 0.25 0.76 0.72 0.62
tube_diam 0.10 -0.11 0.10 -0.16
keel_diam -0.19 -0.34 -0.24 -0.36
wing1_length 0.64 0.61 0.44 0.76
wing2_length 0.71 0.49 0.36 0.69
wingsprea 1.00 0.25 0.16 0.39
hoodmass_g 0.25 1.00 0.76 0.80
tubemass_g 0.16 0.76 1.00 0.63
wingmass_g 0.39 0.80 0.63 1.00
# Perform PCA using prcomp()
# center = TRUE: center variables (mean = 0)
# scale. = TRUE: scale variables (sd = 1)
pca_result <- prcomp(darl_numeric,
center = TRUE,
scale = TRUE)
# View summary
summary(pca_result)Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.2332 1.3288 1.2354 0.75701 0.59301 0.51062 0.46177
Proportion of Variance 0.4987 0.1766 0.1526 0.05731 0.03517 0.02607 0.02132
Cumulative Proportion 0.4987 0.6753 0.8279 0.88521 0.92038 0.94645 0.96777
PC8 PC9 PC10
Standard deviation 0.3768 0.33982 0.25457
Proportion of Variance 0.0142 0.01155 0.00648
Cumulative Proportion 0.9820 0.99352 1.00000
# Create scree plot
fviz_eig(pca_result,
addlabels = TRUE,
ylim = c(0, 50))Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
Ignoring empty aesthetic: `width`.
# View loadings for first 3 PCs
loadings <- pca_result$rotation[, 1:3]
round(loadings, 3) PC1 PC2 PC3
height -0.278 -0.449 0.198
mouth_diam -0.369 -0.110 0.214
tube_diam 0.040 -0.602 -0.390
keel_diam 0.202 -0.491 -0.328
wing1_length -0.375 0.171 -0.280
wing2_length -0.342 0.131 -0.415
wingsprea -0.239 0.156 -0.549
hoodmass_g -0.385 -0.057 0.185
tubemass_g -0.355 -0.320 0.263
wingmass_g -0.393 0.080 -0.007
pc1_loads <- tibble(
Variable = rownames(pca_result$rotation),
Loading = pca_result$rotation[, 1]) %>%
arrange(desc(abs(Loading)))
pc1_loads# A tibble: 10 × 2
Variable Loading
<chr> <dbl>
1 wingmass_g -0.393
2 hoodmass_g -0.385
3 wing1_length -0.375
4 mouth_diam -0.369
5 tubemass_g -0.355
6 wing2_length -0.342
7 height -0.278
8 wingsprea -0.239
9 keel_diam 0.202
10 tube_diam 0.0402
# Visualize PC1 loadings
pc1_loads %>%
ggplot(aes(x = reorder(Variable, Loading), y = Loading)) +
geom_col(aes(fill = Loading > 0), show.legend = FALSE) +
coord_flip() +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "PC1 Loadings",
subtitle = "Overall Plant Size",
x = "Variable",
y = "Loading") +
scale_fill_manual(values = c("FALSE" = "coral", "TRUE" = "steelblue")) +
theme_minimal()pc2_loads <- tibble(
Variable = rownames(pca_result$rotation),
Loading = pca_result$rotation[, 2]
) %>%
arrange(desc(abs(Loading)))
pc2_loads# A tibble: 10 × 2
Variable Loading
<chr> <dbl>
1 tube_diam -0.602
2 keel_diam -0.491
3 height -0.449
4 tubemass_g -0.320
5 wing1_length 0.171
6 wingsprea 0.156
7 wing2_length 0.131
8 mouth_diam -0.110
9 wingmass_g 0.0800
10 hoodmass_g -0.0575
# Visualize PC2 loadings
pc2_loads %>%
ggplot(aes(x = reorder(Variable, Loading), y = Loading)) +
geom_col(aes(fill = Loading > 0), show.legend = FALSE) +
coord_flip() +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "PC2 Loadings",
subtitle = "Shape: Height vs. Wing Size",
x = "Variable",
y = "Loading") +
scale_fill_manual(values = c("FALSE" = "coral", "TRUE" = "steelblue")) +
theme_minimal()# PC scores are in pca_result$x
pc_scores <- as_tibble(pca_result$x) %>%
mutate(site = darlingtonia$site,
plant_id = 1:n())
# First few plants
pc_scores %>%
select(plant_id, site, PC1, PC2, PC3) %>%
head(10)# A tibble: 10 × 5
plant_id site PC1 PC2 PC3
<int> <chr> <dbl> <dbl> <dbl>
1 1 TJH -1.92 0.0772 1.53
2 2 TJH 3.35 1.53 0.755
3 3 TJH 0.918 -0.0320 0.248
4 4 TJH -0.983 0.236 -0.242
5 5 TJH 1.15 -1.02 1.54
6 6 TJH -1.37 -0.613 0.847
7 7 TJH -1.50 -1.63 0.693
8 8 TJH -0.651 0.121 -0.412
9 9 TJH 2.48 -0.0786 -0.260
10 10 TJH -1.11 -1.27 2.27
# Basic scores plot
ggplot(pc_scores, aes(x = PC1, y = PC2, color = site)) +
geom_point(size = 3, alpha = 0.7) +
stat_ellipse(level = 0.68, linetype = 2) +
labs(title = "PCA Scores Plot",
x = paste0("PC1: Overall Size"),
y = paste0("PC2: Shapeb%)"),
color = "Site") +
theme_minimal(base_size = 12) +
theme(legend.position = "right")pc_data <- as.data.frame(pca_result$x) %>%
mutate(site = darlingtonia$site)
plot_ly(pc_data,
x = ~PC1, y = ~PC2, z = ~PC3,
color = ~site,
type = "scatter3d",
mode = "markers") %>%
layout(scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
))# 1. Prepare PC Scores with Site
pc_scores_for_dist <- as.data.frame(pca_result$x) %>%
mutate(site = darlingtonia$site)
# 2. Calculate the mean PC scores for each site (the site "centroids")
site_centroids <- pc_scores_for_dist %>%
group_by(site) %>%
summarise(across(starts_with("PC"), mean)) %>%
ungroup()
# 3. Set site names as row names BEFORE calculating distances
site_centroids_matrix <- site_centroids %>%
column_to_rownames("site") # This keeps site names as row names
# 4. Calculate the Euclidean distance matrix between site centroids
dist_matrix <- dist(site_centroids_matrix, method = "euclidean")
# Display the distance matrix
dist_matrix DG HD LEH
HD 4.346434
LEH 1.621198 3.659711
TJH 1.843120 3.021386 1.960547
# Visualize the distance matrix
fviz_dist(dist_matrix,
gradient = list(low = "#00AFBB", high = "#FC4E07"))Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the factoextra package.
Please report the issue at <https://github.com/kassambara/factoextra/issues>.