# Body measurements for seven canid specimens: one row per specimen,
# six morphometric variables per row.
canid.dat <- rbind(
  c(9.7, 21.0, 19.4, 7.7, 32.0, 36.5),
  c(8.1, 16.7, 18.3, 7.0, 30.3, 32.9),
  c(13.5, 27.3, 26.8, 10.6, 41.9, 48.1),
  c(11.5, 24.3, 24.5, 9.3, 40.0, 44.6),
  c(10.7, 23.5, 21.4, 8.5, 28.8, 37.6),
  c(9.6, 22.6, 21.2, 8.3, 34.4, 43.1),
  c(10.3, 22.1, 19.1, 8.1, 32.3, 35.0)
)

# I. k-means clustering ----
# NOTE: k-means depends on the random starting centroids, which matters a
# lot with a dataset this small. Fixing the RNG seed makes the run
# reproducible, and nstart = 25 restarts from 25 random configurations and
# keeps the best one, which stabilizes the cluster assignments.
set.seed(42)
kmean_canid <- kmeans(canid.dat, centers = 3, nstart = 25)
kmean_canid$cluster  # which cluster an entity belongs to
kmean_canid$centers  # multidimensional centroid of each cluster
kmean_canid$withinss # within-cluster sum of squares for each cluster
kmean_canid$size     # number of entities in a cluster

# II. hierarchical clustering ----
library(MVA)

# Matrix filled with Euclidean distances.
canid.dist <- dist(canid.dat, method = "euclidean")
canid.dist

# Cluster analysis using average linkage.
canid.hclust <- hclust(canid.dist, method = "average")

# Plot the dendrogram.
plot(canid.hclust)

# III. clValid ----
# nClust must be > 1 and < the number of entities.
# The canid example is known to fail: clValid has a bug that cannot handle
# values of zero when calculating connectivity. Wrapping the call in
# tryCatch() lets the rest of the script keep running instead of halting
# here, and the follow-up calls are guarded on the result.
library(clValid)
canid.lots <- tryCatch(
  clValid(canid.dat,
          nClust = 2:6,
          clMethods = c("hierarchical", "kmeans"),
          validation = c("internal", "stability"),
          metric = "euclidean",
          method = "average"),
  error = function(e) {
    message("clValid failed on the canid data: ", conditionMessage(e))
    NULL
  }
)
if (!is.null(canid.lots)) {
  summary(canid.lots)
  optimalScores(canid.lots)
  plot(canid.lots)
}

# Let's try again with different data. ----
library(cluster)
library(clValid)

# Pulling the data out.
library(cluster.datasets)
data(all.mammals.milk.1956)

# Fixing the row names (column 1 holds the mammal names).
rownames(all.mammals.milk.1956) <- all.mammals.milk.1956[, 1]

# Cluster analysis.
# Note that we do NOT include the names in the analysis.
# Validate hierarchical and k-means solutions for 2-20 clusters on the
# numeric measurement columns (columns 2-6; column 1 holds the mammal
# names and is excluded from the analysis). Seed the RNG so the k-means
# runs inside clValid are reproducible.
set.seed(42)
milk.lots <- clValid(all.mammals.milk.1956[2:6],
                     nClust = 2:20,
                     clMethods = c("hierarchical", "kmeans"),
                     validation = c("internal", "stability"),
                     metric = "euclidean",
                     method = "average")
summary(milk.lots)
optimalScores(milk.lots)

# Plot the different method comparison metrics for the different cluster
# methods and varying number of clusters. par() mutates global graphics
# state, so save the previous settings (par() returns them invisibly) and
# restore them afterwards -- the original only reset mfrow and left mai
# permanently changed.
old_par <- par(mfrow = c(2, 4), mai = c(0.4, 0.7, 0.2, 0.2))
plot(milk.lots, legend = FALSE)

# Restore the graphics parameters, then plot the hierarchical dendrogram
# on a single full-size panel.
par(old_par)
plot(milk.lots@clusterObjs$hierarchical)