# Setting the working directory. setwd("C:/Users/Lisa/Dropbox/CSUMB stats class") # This is unique for each computer or person. # Two categories, two variables to determine category. # Download data. DiscExample <- read.csv(file="DiscSample.csv",header=TRUE) # Means by category. # Category classification is in column 1. xbar_1 <- colMeans(DiscExample[DiscExample$Species == 1,2:3]) xbar_2 <- colMeans(DiscExample[DiscExample$Species == 2,2:3]) # Variance-covariance matrix by category. S1 <- cov(DiscExample[DiscExample$Species == 1,2:3]) S2 <- cov(DiscExample[DiscExample$Species == 2,2:3]) # Pooled variance-covariance matrix. # Determine how many datapoints you have for each category. cat_count <- table(DiscExample$Species) # Calculate Spooled. Spooled <- (((cat_count[1]-1)*S1)+((cat_count[2]-1)*S2))/(sum(cat_count)-2) # Calculate coefficients (a). library(MASS) a <- ginv(Spooled)%*%(xbar_1-xbar_2) # Transfer the datapoints on to the new axis. y <- as.matrix(DiscExample[,2:3])%*%a # Calculate new means by category. ybar_1 <- mean(y[DiscExample$Species == 1]) ybar_2 <- mean(y[DiscExample$Species == 2]) # Calculate cutoff value for new data. cutoff <- 0.5*(ybar_1+ybar_2) # New datapoint. x0 <- matrix(c(37,59), nrow = 1, ncol = 2) # Calculate y0. y0 <- x0%*%a # If less than cutoff, put in group 2. Otherwise, group 1. y0 < cutoff # This one should be in group 2. ################# # Let's do it using the lda (linear discriminant analysis) function. # The new data have to be in dataframe format and have named columns. x0dataframe <- as.data.frame(x0) colnames(x0dataframe) <- c("AgeRepro","MassAgeRepro") # "prior" is where you can put your assumed probability beforehand. lda.test <- lda(formula = Species ~ AgeRepro + MassAgeRepro, data = DiscExample, prior = c(0.5,0.5)) predict(lda.test, x0dataframe) # How well did lda perform? # Cross-validation method. lda.CV <- lda(formula = Species ~ AgeRepro + MassAgeRepro, data = DiscExample, prior = c(0.5,0.5), CV = TRUE) # Create a contigency table. # True group membership is in the rows. Predicted group membership is the columns. lda.table <- table(DiscExample[,1],lda.CV$class) # What if we don't assume equal variance-covariance structure? # Then go to quadratic discriminant analysis. qda.test <- qda(formula = Species ~ AgeRepro + MassAgeRepro, data = DiscExample, prior = c(0.5,0.5)) summary(qda.test) predict(qda.test, x0dataframe) # How well did qda perform? # Cross-validation method. qda.CV <- qda(formula = Species ~ AgeRepro + MassAgeRepro, data = DiscExample, prior = c(0.5,0.5), CV = TRUE) # Create a contigency table. # True group membership is in the rows. Predicted group membership is the columns. qda.table <- table(DiscExample[,1],qda.CV$class) # Let's try it with 3 groups and 4 variables. data("iris") lda.iris <- lda(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris) lda.CV.iris <- lda(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, CV = TRUE) # You can get most of what you want to look at just by looking at these results. lda.iris # Create a contingency table. # True group membership is in the rows. Predicted group membership is the columns. lda.table.iris <- table(iris$Species,lda.CV.iris$class) # You can also see how the groups have separated out by plotting the results. plot(lda.iris) # qda qda.iris <- qda(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris) qda.CV.iris <- qda(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, CV = TRUE) # Create a contingency table. qda.table.iris <- table(iris$Species,qda.CV.iris$class) # Interpretation: # Based on cross-validation results, linear discriminant analysis performs better (error rate = 0.020) # compared to quadratic discriminant analysis (error rate = 0.026). Looking at the coefficients of linear # discriminants (lda.iris$scaling), LD1 represents two types of data. "Sepal size" includes sepal length # and width. "Petal size" includes petal width and length. Based on the means (lda.iris$means), flowers # are categorized as group 1 (Iris setosa)when "sepal size" is large and "petal size" is small. When # sepal and petal sizes are about the same, flowers are categorized as group 2 (I. versicolor). LD2 also # represents two types of data. "Width" includes sepal and petal width. Petal length length is the other # type of data. Sepal length does not contribute much to LD2. However, flower groups are difficult to # distinguish when looking at LD2 (proportion of trace = 0.0088). Also, based on the plots, the second # discriminant function does not help to categorize the data. When projecting the data on to the y-axis, # the groups are practically indistinguishable from each other. # If qda is more accurate, variable descriptions are much more difficult to interpret. However, you can # still pull out the same information from the data means. # Predicting for a new datapoint with measured variables but we don't know the group it should go in to. x0.iris <- as.data.frame(matrix(c(6.0,3.1,4.0,1.0), nrow = 1, ncol = 4)) colnames(x0.iris) <- c("Sepal.Length","Sepal.Width", "Petal.Length","Petal.Width") x0.iris predict(lda.iris,x0.iris) predict(qda.iris,x0.iris)