# This example and code comes from http://www.ats.ucla.edu/stat/r/dae/canonical.htm
#Example. A researcher has collected data on three psychological variables,
# four academic variables (standardized test scores) and gender for
# 600 college freshman. She is interested in how the set of psychological
# variables relates to the academic variables and gender. In particular,
# the researcher is interested in how many dimensions (canonical variables)
# are necessary to understand the association between the two sets of variables.
# This analysis requires three packages.
# You will get all sorts of warnings if you don't have the latest
# version of R.
library(ggplot2)
library(GGally)
library(CCA)
# Set the working directory.
setwd("C:/Users/Lisa/Dropbox/CSUMB stats class") # This is unique for each computer or person.
# Download data.
mm <- read.csv(file="mmreg.csv",header=TRUE)
# Rename the data columns.
colnames(mm) <- c("Control", "Concept", "Motivation", "Read", "Write", "Math",
"Science", "Sex")
# Look at the data a little.
summary(mm)
# Contingency table by sex.
table(mm$Sex)
# Here's another command to create a contingency table that is more flexible.
# Use help if you would like to learn more about it.
xtabs(~Sex, data = mm)
# Separate the data in to the psychlogical data and the academic + sex data.
psych <- mm[, 1:3]
acad <- mm[, 4:8]
# A really cool way to look at 1x1 graphs.
ggpairs(psych)
ggpairs(acad)
ggpairs(mm)
# Look at the correlations within and between the two sets of variables.
matcor(psych, acad)
# Run the canonical correlation analysis.
cc1 <- cc(psych, acad)
# Display the canonical correlations.
cc1$cor
# Raw canonical coefficients.
# Again, we don't use these really for interpretation.
# However, they can be discussed just as regression coefficients are discussed.
cc1$xcoef
cc1$ycoef
# Determining the number of significant dimensions.
# Wilke's lambda test.
ev <- (1 - cc1$cor^2)
n <- length(psych[,1]) # Number of entities.
p <- length(psych[1,]) # Number of variables in psych group.
q <- length(acad[1,]) # Number of variables in acad group.
k <- min(p, q) # The total number of dimensions you can have.
m <- n - 3/2 - (p + q)/2
# Calculating the Wilke's lambda value.
w <- rev(cumprod(rev(ev)))
# Define a bunch of vectors where things will be stored.
# All vectors have the same properties: numeric of length k.
# d1 and d2 will be degrees of freedom for the significance tests.
# f will be the F statistic.
d1 <- d2 <- f <- vector("numeric", k)
ploop <- p
qloop <- q
for (i in 1:k) {
s <- sqrt((ploop^2 * qloop^2 - 4)/(ploop^2 + qloop^2 - 5))
si <- 1/s
d1[i] <- ploop * qloop
d2[i] <- m * s - ploop * qloop/2 + 1
r <- (1 - w[i]^si)/w[i]^si
f[i] <- r * d2[i]/d1[i]
ploop <- ploop - 1
qloop <- qloop - 1
}
# Determining the significance of the F statistic from an F distribution.
pv <- pf(f, d1, d2, lower.tail = FALSE)
# Putting it all in a pretty table.
(dmat <- cbind(WilksL = w, F = f, df1 = d1, df2 = d2, p = pv))
# The first row measures the significance of all dimensions.
# Second row, significance of dimensions 2 and 3 combined.
# Third row, significance of dimension 3 by itself.
# Calculate the standardized canonical coefficients when variables have
# very different standard deviations.
s1 <- diag(sqrt(diag(cov(psych))))
stnd_xcoef <- s1 %*% cc1$xcoef
s2 <- diag(sqrt(diag(cov(acad))))
stnd_ycoef <- s2 %*% cc1$ycoef
# Rotating the standardized data and plotting it.
psych_st <- scale(psych, center = TRUE, scale = TRUE)
acad_st <- scale(acad, center = TRUE, scale = TRUE)
U1 <- psych_st%*%stnd_xcoef[,1]
V1 <- acad_st%*%stnd_ycoef[,1]
U2 <- psych_st%*%stnd_xcoef[,2]
V2 <- acad_st%*%stnd_ycoef[,2]
plot(U1, V1)
plot(U2, V2)