# Setting the working directory.
setwd("C:/...") # This is unique for each computer or person.
# Download data.
PCASample <- read.csv(file="....csv",header=TRUE)
# Setting up the data.
Nsamp <- length(PCASample[,1]) # The number of entities in your data (number of rows).
Psamp <- length(PCASample[1,]) # The number of variables of interest (number of columns).
# Testing for skewness.
library(moments)
apply(PCASample,2,skewness)
# Are there any values < -1.0 or > 1.0?
# Testing for normality.
# 1. Shapiro-Wilk test
library(stats)
shapiro.test(PCASample[,1]) #...keep going for each variable
# If the p-value is significant, you probably do not have a normal distribution.
# 2. Kolmogorov-Smirnov.
library(stats)
x <- rnorm(length(PCASample[,1]), mean = mean(PCASample[,1]), sd = sd(PCASample[,1])) # Creating a random sample from a normal distribution to compare the data with.
ks.test(x,PCASample[,1]) #...keep going for each variable
# If the p-value is significant, you probably do not have a normal distribution.
# Testing for outliers.
# Boxplot
boxplot(PCASample) # Careful! What does the graphic really mean?
# Normal probability plot
library(stats)
qqnorm(PCASample[,1])
qqline(PCASample[,1]) #...keep going for each variable
# Principal Components Analysis
# There are three ways to do this.
# The first two are ready-made programs in R that give you a lot of things you do NOT
# need and very little that you do need.
# 1. princomp
#PCAOut1 <- princomp(PCASample, cor = TRUE)
#PCAOut1$loadings[1:20,1:3] #gives the loadings or the eigenvectors?
#PCAOut1$scores #gives the scores?
# 2. prcomp
#PCAOut2 <- prcomp(PCASample, cor = TRUE)
#PCAOut2$rotation[1:20,1:3] #gives the loadings or the eigenvectors?
# 3. Doing it by hand (almost).
# a. Get the correlation or covariance matrix.
corPCAsample <- cor(PCASample)
covPCAsample <- cov(PCASample)
# b. Perform an eigenanalysis...PROBABLY on the correlation matrix.
eigcorPCAsample <- eigen(corPCAsample)
eigcorPCAsample$values #eigenvalues (lambda)
eigcorPCAsample$vectors #eigenvectors (PC weights)
# How many PCs to keep?
# 1. scree plot
plot(x = 1:Psamp,y = eigcorPCAsample$values)
# 2. broken stick
# calculate random eigenvalue vector
random_eigenvalue <- vector(length = 0)
for (i in 1:Psamp){
random_eigenvalue <- c(random_eigenvalue,sum(1/(i:Psamp)))
}
# compare the calculated and random eigenvalues
which(eigcorPCAsample$values > random_eigenvalue)
# 3. relative percent variance
RPV <- eigcorPCAsample$values/Psamp
#####################################
# Helpful tips for homework.
# a HINT for doing many iterations.
# 1. determine how many iterations you want to do.
NIter <- ???
# 2. define the vector and/or matrix you want to fill
VectIter <- vector(length = NIter)
MatrixIter <- matrix(nrow = NIter, ncol = ???)
# 3. fill the vector or matrix via a loop
for (i in 1:NIter){
# .....do all your calculations here
VectIter[i] <- ???
MatrixIter[i,] <- ???
}
# Some random number generators
x <- 1:5
sample(x) # sample all without replacement, just jumbles the numbers
sample(x,3) # sample three numbers from x without replacement
sample(x,10,replace = TRUE) # sample 10 numbers from x with replacement
#####################################
# Interpretation
# 1. Loadings
load_eigcorPCAsample <- matrix(ncol = Psamp,
nrow = Psamp)
# Give the loadings row and column names.
dimnames(load_eigcorPCAsample) <- list(Variables = names(PCAData), PC = 1:Psamp)
for (i in 1:Psamp) {
load_eigcorPCAsample[i,] <-eigcorPCAsample$vectors[i,]*sqrt(eigcorPCAsample$values)
}
# 2. Scores
# = sum(Eigenvector column * Data row adjusted)
# Standardize the data.
standardize <- function(x) {(x - mean(x))/sd(x)}
PCASamp_stnd <- apply(PCASample, MARGIN=2, FUN=standardize)
# Calculate the score.
score_eigcorPCAsample <- PCASamp_stnd%*%eigcorPCAsample$vectors # The %*% is matrix multiplication.
# Look at component plots to test linearity assumption.
# Everything should look like a cloud.
plot(score_eigcorPCAsample[,1],score_eigcorPCAsample[,2])
plot(PCASample[,1],PCASample[,2])