What are we using FactoMineR and factoextra for?

# install.packages("FactoMineR")
# install.packages("factoextra")

library(FactoMineR)
library(ggplot2)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

What do we need to know about PCA?

A Simple Example

PCA
# Load the built-in iris data and preview its first rows: four continuous
# measurement columns plus the categorical Species column
data(iris)
head(iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Keep only the continuous measurements by dropping the Species column;
# Species itself stays available in `iris` for later use
iris2 <- iris[setdiff(names(iris), "Species")]
head(iris2, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
# Run the PCA on the continuous variables. scale.unit = TRUE scales the
# variables to unit variance; graph = FALSE skips FactoMineR's own plots.
iris.pca <- PCA(
  X = iris2,
  scale.unit = TRUE,
  graph = FALSE
)

# Eigenvalue table: one row per principal component (there are as many PCs
# as original variables); an eigenvalue measures how much variance the PC
# contains
iris.pca[["eig"]]
##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.91849782             72.9624454                          72.96245
## comp 2 0.91403047             22.8507618                          95.81321
## comp 3 0.14675688              3.6689219                          99.48213
## comp 4 0.02071484              0.5178709                         100.00000
Scree plot
# Scree plot drawn with factoextra; ncp is the number of PCs to show
fviz_eig(X = iris.pca, ncp = 4)

Plots with individuals and contributions of variables
# Variable factor map on PCs 1 and 2, drawn with FactoMineR's own graphics
# (dispatched through the plot() generic)
plot(iris.pca, axes = c(1, 2), choix = "var")

# The arrows show the directions of the contributions of the original
# variables; variables pointing the same way are correlated with each other

# factoextra biplot: a biplot shows the individuals and the variable
# contributions on the same axes
fviz_pca_biplot(iris.pca)

# Variables only (cleaner than the full biplot), coloured by "contrib"
fviz_pca_var(X = iris.pca, col.var = "contrib")

# Same plot with an explicit blue -> steelblue -> red colour scale centred
# on 25 and an empty theme
fviz_pca_var(X = iris.pca, col.var = "contrib") +
  scale_colour_gradient2(
    low = "blue",
    mid = "steelblue",
    high = "red",
    midpoint = 25.0
  ) +
  theme_void()

# Individuals only, drawn without labels
fviz_pca_ind(X = iris.pca, label = "none")

# Group the individuals by Species (taken from the original iris data) via
# habillage; this changes the colour and shape of the points
fviz_pca_ind(X = iris.pca, habillage = iris[["Species"]], label = "none")

# Same grouping, with ellipses added at the 0.95 level
fviz_pca_ind(
  X = iris.pca,
  habillage = iris[["Species"]],
  label = "none",
  addEllipses = TRUE,
  ellipse.level = 0.95
)

Biplot
# Make a pretty biplot: individuals drawn as points filled by Species (with
# ellipses), overlaid with the variable arrows coloured by "contrib".
#
# The arrow colour scale is defined once, by scale_color_gradient2() below.
# Also passing gradient.cols to fviz_pca_biplot() would install a colour
# scale that ggplot2 immediately replaces with the one added here, so that
# argument had no visible effect and only triggered the message
# "Scale for 'colour' is already present ..." on every render.
fviz_pca_biplot(
  iris.pca,
  # individuals
  geom.ind = "point",
  fill.ind = iris$Species,
  col.ind = "black",
  pointshape = 21,
  pointsize = 2,
  addEllipses = TRUE,
  # variables
  col.var = "contrib",
  legend.title = list(
    fill = "Species",
    color = "Contrib",
    alpha = "Contrib"
  )
) +
  scale_color_gradient2(
    low = "blue", mid = "steelblue", high = "red",
    midpoint = 25.0
  )