# Setting the working directory.
setwd("C:/Users/Lisa/Dropbox/CSUMB stats class") # This is unique for each computer or person.
# Two categories, two variables to determine category.
# Download data.
DiscExample <- read.csv(file="DiscSample.csv",header=TRUE)
# Means by category.
# Category classification is in column 1.
xbar_1 <- colMeans(DiscExample[DiscExample$Species == 1,2:3])
xbar_2 <- colMeans(DiscExample[DiscExample$Species == 2,2:3])
# Variance-covariance matrix by category.
S1 <- cov(DiscExample[DiscExample$Species == 1,2:3])
S2 <- cov(DiscExample[DiscExample$Species == 2,2:3])
# Pooled variance-covariance matrix.
# Determine how many datapoints you have for each category.
cat_count <- table(DiscExample$Species)
# Calculate Spooled.
Spooled <- (((cat_count[1]-1)*S1)+((cat_count[2]-1)*S2))/(sum(cat_count)-2)
# Calculate coefficients (a).
library(MASS)
a <- ginv(Spooled)%*%(xbar_1-xbar_2)
# Transfer the datapoints on to the new axis.
y <- as.matrix(DiscExample[,2:3])%*%a
# Calculate new means by category.
ybar_1 <- mean(y[DiscExample$Species == 1])
ybar_2 <- mean(y[DiscExample$Species == 2])
# Calculate cutoff value for new data.
cutoff <- 0.5*(ybar_1+ybar_2)
# New datapoint.
x0 <- matrix(c(37,59), nrow = 1, ncol = 2)
# Calculate y0.
y0 <- x0%*%a
# If less than cutoff, put in group 2. Otherwise, group 1.
y0 < cutoff
# This one should be in group 2.
#################
# Let's do it using the lda (linear discriminant analysis) function.
# The new data have to be in dataframe format and have named columns.
x0dataframe <- as.data.frame(x0)
colnames(x0dataframe) <- c("AgeRepro","MassAgeRepro")
# "prior" is where you can put your assumed probability beforehand.
lda.test <- lda(formula = Species ~ AgeRepro + MassAgeRepro,
data = DiscExample, prior = c(0.5,0.5))
predict(lda.test, x0dataframe)
# How well did lda perform?
# Cross-validation method.
lda.CV <- lda(formula = Species ~ AgeRepro + MassAgeRepro,
data = DiscExample, prior = c(0.5,0.5), CV = TRUE)
# Create a contigency table.
# True group membership is in the rows. Predicted group membership is the columns.
lda.table <- table(DiscExample[,1],lda.CV$class)
# What if we don't assume equal variance-covariance structure?
# Then go to quadratic discriminant analysis.
qda.test <- qda(formula = Species ~ AgeRepro + MassAgeRepro,
data = DiscExample, prior = c(0.5,0.5))
summary(qda.test)
predict(qda.test, x0dataframe)
# How well did qda perform?
# Cross-validation method.
qda.CV <- qda(formula = Species ~ AgeRepro + MassAgeRepro,
data = DiscExample, prior = c(0.5,0.5), CV = TRUE)
# Create a contigency table.
# True group membership is in the rows. Predicted group membership is the columns.
qda.table <- table(DiscExample[,1],qda.CV$class)
# Let's try it with 3 groups and 4 variables.
data("iris")
lda.iris <- lda(formula = Species ~ Sepal.Length + Sepal.Width +
Petal.Length + Petal.Width,
data = iris)
lda.CV.iris <- lda(formula = Species ~ Sepal.Length + Sepal.Width +
Petal.Length + Petal.Width,
data = iris, CV = TRUE)
# You can get most of what you want to look at just by looking at these results.
lda.iris
# Create a contingency table.
# True group membership is in the rows. Predicted group membership is the columns.
lda.table.iris <- table(iris$Species,lda.CV.iris$class)
# You can also see how the groups have separated out by plotting the results.
plot(lda.iris)
# qda
qda.iris <- qda(formula = Species ~ Sepal.Length + Sepal.Width +
Petal.Length + Petal.Width,
data = iris)
qda.CV.iris <- qda(formula = Species ~ Sepal.Length + Sepal.Width +
Petal.Length + Petal.Width,
data = iris, CV = TRUE)
# Create a contingency table.
qda.table.iris <- table(iris$Species,qda.CV.iris$class)
# Interpretation:
# Based on cross-validation results, linear discriminant analysis performs better (error rate = 0.020)
# compared to quadratic discriminant analysis (error rate = 0.026). Looking at the coefficients of linear
# discriminants (lda.iris$scaling), LD1 represents two types of data. "Sepal size" includes sepal length
# and width. "Petal size" includes petal width and length. Based on the means (lda.iris$means), flowers
# are categorized as group 1 (Iris setosa)when "sepal size" is large and "petal size" is small. When
# sepal and petal sizes are about the same, flowers are categorized as group 2 (I. versicolor). LD2 also
# represents two types of data. "Width" includes sepal and petal width. Petal length length is the other
# type of data. Sepal length does not contribute much to LD2. However, flower groups are difficult to
# distinguish when looking at LD2 (proportion of trace = 0.0088). Also, based on the plots, the second
# discriminant function does not help to categorize the data. When projecting the data on to the y-axis,
# the groups are practically indistinguishable from each other.
# If qda is more accurate, variable descriptions are much more difficult to interpret. However, you can
# still pull out the same information from the data means.
# Predicting for a new datapoint with measured variables but we don't know the group it should go in to.
x0.iris <- as.data.frame(matrix(c(6.0,3.1,4.0,1.0), nrow = 1, ncol = 4))
colnames(x0.iris) <- c("Sepal.Length","Sepal.Width",
"Petal.Length","Petal.Width")
x0.iris
predict(lda.iris,x0.iris)
predict(qda.iris,x0.iris)