# This example and code come from http://www.ats.ucla.edu/stat/r/dae/canonical.htm

# Example. A researcher has collected data on three psychological variables,
# four academic variables (standardized test scores), and gender for
# 600 college freshmen. She is interested in how the set of psychological
# variables relates to the academic variables and gender. In particular,
# the researcher is interested in how many dimensions (canonical variables)
# are necessary to understand the association between the two sets of variables.

# This analysis requires three packages.
# You will get all sorts of warnings if you don't have the latest
# version of R.
library(ggplot2)
library(GGally)
library(CCA)

# Set the working directory.
setwd("C:/Users/Lisa/Dropbox/CSUMB stats class") # This is unique for each computer or person.

# Read in the data.
mm <- read.csv(file = "mmreg.csv", header = TRUE)

# Rename the data columns.
colnames(mm) <- c("Control", "Concept", "Motivation", "Read", "Write",
                  "Math", "Science", "Sex")

# Look at the data a little.
summary(mm)

# Frequency table by sex.
table(mm$Sex)

# Here's another command to create a contingency table that is more flexible.
# Use help if you would like to learn more about it.
xtabs(~ Sex, data = mm)

# Separate the data into the psychological variables and the academic + sex variables.
psych <- mm[, 1:3]
acad <- mm[, 4:8]

# A really cool way to look at all the pairwise graphs.
ggpairs(psych)
ggpairs(acad)
ggpairs(mm)

# Look at the correlations within and between the two sets of variables.
matcor(psych, acad)

# Run the canonical correlation analysis.
cc1 <- cc(psych, acad)

# Display the canonical correlations.
cc1$cor

# Raw canonical coefficients.
# Again, we don't really use these for interpretation.
# However, they can be discussed just as regression coefficients are discussed.
cc1$xcoef
cc1$ycoef

# Determining the number of significant dimensions.
# Wilks' lambda test.
ev <- (1 - cc1$cor^2)     # One minus the squared canonical correlations.
n <- length(psych[, 1])   # Number of observations.
p <- length(psych[1, ])   # Number of variables in the psych group.
q <- length(acad[1, ])    # Number of variables in the acad group.
k <- min(p, q)            # The total number of dimensions you can have.
m <- n - 3/2 - (p + q)/2  # Constant used in Rao's F approximation.

# Calculating the Wilks' lambda values.
w <- rev(cumprod(rev(ev)))

# Define a bunch of vectors where things will be stored.
# All vectors have the same properties: numeric of length k.
# d1 and d2 will be degrees of freedom for the significance tests.
# f will be the F statistic.
d1 <- d2 <- f <- vector("numeric", k)
ploop <- p
qloop <- q
for (i in 1:k) {
  s <- sqrt((ploop^2 * qloop^2 - 4)/(ploop^2 + qloop^2 - 5))
  si <- 1/s
  d1[i] <- ploop * qloop
  d2[i] <- m * s - ploop * qloop/2 + 1
  r <- (1 - w[i]^si)/w[i]^si
  f[i] <- r * d2[i]/d1[i]
  ploop <- ploop - 1
  qloop <- qloop - 1
}

# Determining the significance of each F statistic from an F distribution.
pv <- pf(f, d1, d2, lower.tail = FALSE)

# Putting it all in a pretty table.
(dmat <- cbind(WilksL = w, F = f, df1 = d1, df2 = d2, p = pv))
# The first row tests the significance of all dimensions together.
# Second row: significance of dimensions 2 and 3 combined.
# Third row: significance of dimension 3 by itself.

# Calculate the standardized canonical coefficients, which are useful when
# the variables have very different standard deviations.
s1 <- diag(sqrt(diag(cov(psych))))  # Diagonal matrix of psych standard deviations.
stnd_xcoef <- s1 %*% cc1$xcoef
s2 <- diag(sqrt(diag(cov(acad))))   # Diagonal matrix of acad standard deviations.
stnd_ycoef <- s2 %*% cc1$ycoef
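
# Optionally, look at the canonical loadings: the correlations between each
# original variable and the canonical variates, which are often easier to
# interpret than the coefficients. A minimal sketch, assuming the comput()
# helper in your installed version of the CCA package (the exact ordering of
# its components may vary between versions):
cc2 <- comput(psych, acad, cc1)
cc2[3:6]  # The loading matrices (variables vs. canonical variates).
# Depending on the package version, similar correlations may also be stored
# in cc1$scores.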
# Project the standardized data onto the standardized coefficients
# (i.e., compute the canonical variate scores) and plot them.
psych_st <- scale(psych, center = TRUE, scale = TRUE)
acad_st <- scale(acad, center = TRUE, scale = TRUE)
U1 <- psych_st %*% stnd_xcoef[, 1]  # Scores on the first psych canonical variate.
V1 <- acad_st %*% stnd_ycoef[, 1]   # Scores on the first acad canonical variate.
U2 <- psych_st %*% stnd_xcoef[, 2]  # Scores on the second psych canonical variate.
V2 <- acad_st %*% stnd_ycoef[, 2]   # Scores on the second acad canonical variate.
plot(U1, V1)
plot(U2, V2)
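
# One way to dress up the first plot with ggplot2, which is already loaded.
# This is just a sketch: the data frame name and axis labels are illustrative
# choices, and it assumes U1/V1 from the step above and the Sex column of mm.
cc_scores <- data.frame(U1 = as.numeric(U1), V1 = as.numeric(V1),
                        Sex = factor(mm$Sex))
ggplot(cc_scores, aes(x = U1, y = V1, colour = Sex)) +
  geom_point() +
  labs(x = "First psychological canonical variate",
       y = "First academic canonical variate")
# The CCA package also includes a built-in plot of the canonical structure;
# if it is available in your version, plt.cc(cc1) draws it with the defaults.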