# Setting the working directory.
setwd("C:/Users/Lisa/Dropbox/CSUMB stats class")  # This is unique for each computer or person.


# Two categories, two variables to determine category.
# Download data.
DiscExample <- read.csv(file="DiscSample.csv",header=TRUE)

# Means by category.
# Category classification is in column 1.
xbar_1 <- colMeans(DiscExample[DiscExample$Species == 1,2:3])
xbar_2 <- colMeans(DiscExample[DiscExample$Species == 2,2:3])

# Variance-covariance matrix by category.
S1 <- cov(DiscExample[DiscExample$Species == 1,2:3])
S2 <- cov(DiscExample[DiscExample$Species == 2,2:3])

# Pooled variance-covariance matrix.
# Determine how many datapoints you have for each category.
cat_count <- table(DiscExample$Species)
# Calculate Spooled.
Spooled <- (((cat_count[1]-1)*S1)+((cat_count[2]-1)*S2))/(sum(cat_count)-2)

# Calculate coefficients (a).
library(MASS)
a <- ginv(Spooled)%*%(xbar_1-xbar_2)

# Transfer the datapoints on to the new axis.
y <- as.matrix(DiscExample[,2:3])%*%a

# Calculate new means by category.
ybar_1 <- mean(y[DiscExample$Species == 1])
ybar_2 <- mean(y[DiscExample$Species == 2])

# Calculate cutoff value for new data.
cutoff <- 0.5*(ybar_1+ybar_2)

# New datapoint.
x0 <- matrix(c(37,59), nrow = 1, ncol = 2)

# Calculate y0.
y0 <- x0%*%a

# If less than cutoff, put in group 2. Otherwise, group 1.
y0 < cutoff
#  This one should be in group 2.


#################
# Let's do it using the lda (linear discriminant analysis) function.
# The new data have to be in dataframe format and have named columns.
x0dataframe <- as.data.frame(x0)
colnames(x0dataframe) <- c("AgeRepro","MassAgeRepro")
# "prior" is where you can put your assumed probability beforehand.
lda.test <- lda(formula = Species ~ AgeRepro + MassAgeRepro,
                data = DiscExample, prior = c(0.5,0.5))
predict(lda.test, x0dataframe)

# How well did lda perform?
# Cross-validation method.
lda.CV <- lda(formula = Species ~ AgeRepro + MassAgeRepro,
              data = DiscExample, prior = c(0.5,0.5), CV = TRUE)
#  Create a contigency table.
# True group membership is in the rows. Predicted group membership is the columns.
lda.table <- table(DiscExample[,1],lda.CV$class)

# What if we don't assume equal variance-covariance structure?
#   Then go to quadratic discriminant analysis.
qda.test <- qda(formula = Species ~ AgeRepro + MassAgeRepro,
                data = DiscExample, prior = c(0.5,0.5))
summary(qda.test)
predict(qda.test, x0dataframe)

# How well did qda perform?
# Cross-validation method.
qda.CV <- qda(formula = Species ~ AgeRepro + MassAgeRepro,
              data = DiscExample, prior = c(0.5,0.5), CV = TRUE)
#  Create a contigency table.
# True group membership is in the rows. Predicted group membership is the columns.
qda.table <- table(DiscExample[,1],qda.CV$class)


# Let's try it with 3 groups and 4 variables.
data("iris")
lda.iris <- lda(formula = Species ~ Sepal.Length + Sepal.Width +
                     Petal.Length + Petal.Width,
                   data = iris)
lda.CV.iris <- lda(formula = Species ~ Sepal.Length + Sepal.Width +
                     Petal.Length + Petal.Width,
                data = iris, CV = TRUE)
# You can get most of what you want to look at just by looking at these results.
lda.iris
#  Create a contingency table.
# True group membership is in the rows. Predicted group membership is the columns.
lda.table.iris <- table(iris$Species,lda.CV.iris$class)
# You can also see how the groups have separated out by plotting the results.
plot(lda.iris)

# qda
qda.iris <- qda(formula = Species ~ Sepal.Length + Sepal.Width +
                  Petal.Length + Petal.Width,
                data = iris)
qda.CV.iris <- qda(formula = Species ~ Sepal.Length + Sepal.Width +
                     Petal.Length + Petal.Width,
                   data = iris, CV = TRUE)
#  Create a contingency table.
qda.table.iris <- table(iris$Species,qda.CV.iris$class)

# Interpretation:
#  Based on cross-validation results, linear discriminant analysis performs better (error rate = 0.020)
#  compared to quadratic discriminant analysis (error rate = 0.026). Looking at the coefficients of linear
#  discriminants (lda.iris$scaling), LD1 represents two types of data. "Sepal size" includes sepal length
#  and width. "Petal size" includes petal width and length. Based on the means (lda.iris$means), flowers
#  are categorized as group 1 (Iris setosa)when "sepal size" is large and "petal size" is small. When
#  sepal and petal sizes are about the same, flowers are categorized as group 2 (I. versicolor). LD2 also
#  represents two types of data. "Width" includes sepal and petal width. Petal length length is the other
#  type of data. Sepal length does not contribute much to LD2. However, flower groups are difficult to
#  distinguish when looking at LD2 (proportion of trace = 0.0088). Also, based on the plots, the second
#  discriminant function does not help to categorize the data. When projecting the data on to the y-axis,
#  the groups are practically indistinguishable from each other.
#  If qda is more accurate, variable descriptions are much more difficult to interpret. However, you can
#   still pull out the same information from the data means.

# Predicting for a new datapoint with measured variables but we don't know the group it should go in to.
x0.iris <- as.data.frame(matrix(c(6.0,3.1,4.0,1.0), nrow = 1, ncol = 4))
colnames(x0.iris) <- c("Sepal.Length","Sepal.Width",
                         "Petal.Length","Petal.Width")
x0.iris

predict(lda.iris,x0.iris)
predict(qda.iris,x0.iris)