# Body measurements for seven canid specimens: one row per specimen,
# six morphometric variables per row.
canid.dat <- rbind(
  c(9.7, 21.0, 19.4, 7.7, 32.0, 36.5),
  c(8.1, 16.7, 18.3, 7.0, 30.3, 32.9),
  c(13.5, 27.3, 26.8, 10.6, 41.9, 48.1),
  c(11.5, 24.3, 24.5, 9.3, 40.0, 44.6),
  c(10.7, 23.5, 21.4, 8.5, 28.8, 37.6),
  c(9.6, 22.6, 21.2, 8.3, 34.4, 43.1),
  c(10.3, 22.1, 19.1, 8.1, 32.3, 35.0)
)

# I. k-means clustering ----
# NOTE: k-means depends on the random starting centroids, which matters a
# lot with a dataset this small. Fixing the RNG seed makes the run
# reproducible, and nstart = 25 restarts from 25 random configurations and
# keeps the best one, which stabilizes the cluster assignments.
set.seed(42)
kmean_canid <- kmeans(canid.dat, centers = 3, nstart = 25)
kmean_canid$cluster  # which cluster an entity belongs to
kmean_canid$centers  # multidimensional centroid of each cluster
kmean_canid$withinss # within-cluster sum of squares for each cluster
kmean_canid$size     # number of entities in a cluster

# II. hierarchical clustering ----
library(MVA)

# Matrix filled with Euclidean distances.
canid.dist <- dist(canid.dat, method = "euclidean")
canid.dist

# Cluster analysis using average linkage.
canid.hclust <- hclust(canid.dist, method = "average")

# Plot the dendrogram.
plot(canid.hclust)

# III. clValid ----
# nClust must be > 1 and < the number of entities.
# The canid example is known to fail: clValid has a bug that cannot handle
# values of zero when calculating connectivity. Wrapping the call in
# tryCatch() lets the rest of the script keep running instead of halting
# here, and the follow-up calls are guarded on the result.
library(clValid)
canid.lots <- tryCatch(
  clValid(canid.dat,
          nClust = 2:6,
          clMethods = c("hierarchical", "kmeans"),
          validation = c("internal", "stability"),
          metric = "euclidean",
          method = "average"),
  error = function(e) {
    message("clValid failed on the canid data: ", conditionMessage(e))
    NULL
  }
)
if (!is.null(canid.lots)) {
  summary(canid.lots)
  optimalScores(canid.lots)
  plot(canid.lots)
}

# Let's try again with different data. ----
library(cluster)
library(clValid)

# Pulling the data out.
library(cluster.datasets)
data(all.mammals.milk.1956)

# Fixing the row names (column 1 holds the mammal names).
rownames(all.mammals.milk.1956) <- all.mammals.milk.1956[, 1]

# Cluster analysis.
# Note that we do NOT include the names in the analysis.
# Validate hierarchical and k-means solutions for 2-20 clusters on the
# numeric measurement columns (columns 2-6; column 1 holds the mammal
# names and is excluded from the analysis). Seed the RNG so the k-means
# runs inside clValid are reproducible.
set.seed(42)
milk.lots <- clValid(all.mammals.milk.1956[2:6],
                     nClust = 2:20,
                     clMethods = c("hierarchical", "kmeans"),
                     validation = c("internal", "stability"),
                     metric = "euclidean",
                     method = "average")
summary(milk.lots)
optimalScores(milk.lots)

# Plot the different method comparison metrics for the different cluster
# methods and varying number of clusters. par() mutates global graphics
# state, so save the previous settings (par() returns them invisibly) and
# restore them afterwards -- the original only reset mfrow and left mai
# permanently changed.
old_par <- par(mfrow = c(2, 4), mai = c(0.4, 0.7, 0.2, 0.2))
plot(milk.lots, legend = FALSE)

# Restore the graphics parameters, then plot the hierarchical dendrogram
# on a single full-size panel.
par(old_par)
plot(milk.lots@clusterObjs$hierarchical)