# Setting the working directory.
setwd("C:/...") # This is unique for each computer or person.

# Read in the data.
PCASample <- read.csv(file="....csv", header=TRUE)

# Setting up the data.
Nsamp <- length(PCASample[,1]) # The number of entities in your data (number of rows).
Psamp <- length(PCASample[1,]) # The number of variables of interest (number of columns).

# Testing for skewness.
library(moments)
apply(PCASample, 2, skewness) # Are there any values < -1.0 or > 1.0?

# Testing for normality.

# 1. Shapiro-Wilk test
library(stats)
shapiro.test(PCASample[,1])
# ...keep going for each variable.
# If the p-value is significant, you probably do not have a normal distribution.

# 2. Kolmogorov-Smirnov test
library(stats)
# Create a random sample from a normal distribution to compare the data with.
x <- rnorm(length(PCASample[,1]), mean = mean(PCASample[,1]), sd = sd(PCASample[,1]))
ks.test(x, PCASample[,1])
# ...keep going for each variable.
# If the p-value is significant, you probably do not have a normal distribution.

# Testing for outliers.

# Boxplot
boxplot(PCASample) # Careful! What does the graphic really mean?

# Normal probability plot
library(stats)
qqnorm(PCASample[,1])
qqline(PCASample[,1])
# ...keep going for each variable.

# Principal Components Analysis
# There are three ways to do this.
# The first two are ready-made programs in R that give you a lot of things you do NOT
# need and very little that you do need.

# 1. princomp
#PCAOut1 <- princomp(PCASample, cor = TRUE)
#PCAOut1$loadings[1:20,1:3] # gives the loadings or the eigenvectors?
#PCAOut1$scores # gives the scores?

# 2. prcomp
#PCAOut2 <- prcomp(PCASample, scale. = TRUE) # prcomp uses scale. = TRUE, not cor = TRUE
#PCAOut2$rotation[1:20,1:3] # gives the loadings or the eigenvectors?

# 3. Doing it by hand (almost).

# a. Get the correlation or covariance matrix.
corPCAsample <- cor(PCASample)
covPCAsample <- cov(PCASample)

# b. Perform an eigenanalysis...PROBABLY on the correlation matrix.
eigcorPCAsample <- eigen(corPCAsample)
eigcorPCAsample$values  # eigenvalues (lambda)
eigcorPCAsample$vectors # eigenvectors (PC weights)

# How many PCs to keep?

# 1. Scree plot
plot(x = 1:Psamp, y = eigcorPCAsample$values)

# 2. Broken stick
# Calculate the broken-stick (random) eigenvalue vector.
random_eigenvalue <- vector(length = 0)
for (i in 1:Psamp){
  random_eigenvalue <- c(random_eigenvalue, sum(1/(i:Psamp)))
}
# Compare the observed eigenvalues with the broken-stick eigenvalues.
which(eigcorPCAsample$values > random_eigenvalue)

# 3. Relative percent variance
RPV <- eigcorPCAsample$values/Psamp

#####################################
# Helpful tips for homework.

# A HINT for doing many iterations.
# 1. Determine how many iterations you want to do.
NIter <- ???
# 2. Define the vector and/or matrix you want to fill.
VectIter <- vector(length = NIter)
MatrixIter <- matrix(nrow = NIter, ncol = ???)
# 3. Fill the vector or matrix via a loop.
for (i in 1:NIter){
  # .....do all your calculations here
  VectIter[i] <- ???
  MatrixIter[i,] <- ???
}

# Some random number generators
x <- 1:5
sample(x)                     # sample all without replacement; just jumbles the numbers
sample(x, 3)                  # sample three numbers from x without replacement
sample(x, 10, replace = TRUE) # sample 10 numbers from x with replacement

#####################################
# Interpretation

# 1. Loadings
load_eigcorPCAsample <- matrix(ncol = Psamp, nrow = Psamp)
# Give the loadings row and column names.
dimnames(load_eigcorPCAsample) <- list(Variables = names(PCASample), PC = 1:Psamp)
# Loading = eigenvector element * sqrt(eigenvalue).
for (i in 1:Psamp) {
  load_eigcorPCAsample[i,] <- eigcorPCAsample$vectors[i,]*sqrt(eigcorPCAsample$values)
}
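
# (Optional sanity check on the loadings: a minimal sketch, assuming the loop above has
# filled load_eigcorPCAsample. For a correlation-matrix PCA, the squared loadings in each
# column should sum to that PC's eigenvalue, and the squared loadings in each row should
# sum to approximately 1, because loading = eigenvector element * sqrt(eigenvalue).)
colSums(load_eigcorPCAsample^2) # compare with eigcorPCAsample$values
rowSums(load_eigcorPCAsample^2) # each should be approximately 1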
# 2. Scores
# Score = sum(eigenvector column * standardized data row).

# Standardize the data.
standardize <- function(x) {(x - mean(x))/sd(x)}
PCASamp_stnd <- apply(PCASample, MARGIN = 2, FUN = standardize)

# Calculate the scores.
score_eigcorPCAsample <- PCASamp_stnd %*% eigcorPCAsample$vectors # The %*% is matrix multiplication.

# Look at component plots to test the linearity assumption.
# Everything should look like a cloud.
plot(score_eigcorPCAsample[,1], score_eigcorPCAsample[,2])
plot(PCASample[,1], PCASample[,2])
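
# (Optional cross-check on the scores: a minimal sketch, assuming score_eigcorPCAsample
# was computed as above and no eigenvalue is essentially zero. PC scores should be
# uncorrelated with one another, and the variance of each score column should equal
# the corresponding eigenvalue.)
round(cor(score_eigcorPCAsample), 3) # off-diagonal entries should be ~0
apply(score_eigcorPCAsample, 2, var) # compare with eigcorPCAsample$values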