Libraries

library(dplyr)
library(cluster)
library(ggplot2)
library(dbscan)

Load Data

Loading the wine dataset, summarizing it, and setting a standard seed.

# Location of the wine CSV. The default is the original absolute path on the
# author's machine; set the WINE_CSV environment variable to run the analysis
# against a copy stored elsewhere without editing this file.
wine_csv <- Sys.getenv(
  "WINE_CSV",
  unset = "/Users/dylandoyle-rowan/Downloads/Wine-clustering-HC.csv"
)
wines <- read.csv(wine_csv)

# Per-column overview: five-number summaries for the numeric columns and
# length/class for the character columns.
summary(wines)
##     Alcohol        Malic_Acid         Ash         Ash_Alcanity  
##  Min.   :11.03   Min.   :0.740   Min.   :1.360   Min.   :10.60  
##  1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210   1st Qu.:17.20  
##  Median :13.05   Median :1.865   Median :2.360   Median :19.50  
##  Mean   :13.00   Mean   :2.336   Mean   :2.367   Mean   :19.49  
##  3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558   3rd Qu.:21.50  
##  Max.   :14.83   Max.   :5.800   Max.   :3.230   Max.   :30.00  
##    Magnesium      Total_Phenols     Flavanoids    Nonflavanoid_Phenols
##  Min.   : 70.00   Min.   :0.980   Min.   :0.340   Min.   :0.1300      
##  1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205   1st Qu.:0.2700      
##  Median : 98.00   Median :2.355   Median :2.135   Median :0.3400      
##  Mean   : 99.74   Mean   :2.295   Mean   :2.029   Mean   :0.3619      
##  3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875   3rd Qu.:0.4375      
##  Max.   :162.00   Max.   :3.880   Max.   :5.080   Max.   :0.6600      
##  Proanthocyanins Color_Intensity       Hue             OD280      
##  Min.   :0.410   Min.   : 1.280   Min.   :0.4800   Min.   :1.270  
##  1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825   1st Qu.:1.938  
##  Median :1.555   Median : 4.690   Median :0.9650   Median :2.780  
##  Mean   :1.591   Mean   : 5.058   Mean   :0.9574   Mean   :2.612  
##  3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200   3rd Qu.:3.170  
##  Max.   :3.580   Max.   :13.000   Max.   :1.7100   Max.   :4.000  
##     Proline       Alcohol_Level      Ash_Content        Color_Intensity_Group
##  Min.   : 278.0   Length:178         Length:178         Length:178           
##  1st Qu.: 500.5   Class :character   Class :character   Class :character     
##  Median : 673.5   Mode  :character   Mode  :character   Mode  :character     
##  Mean   : 746.9                                                              
##  3rd Qu.: 985.0                                                              
##  Max.   :1680.0
# Fix the random-number seed so that any later step drawing random numbers
# gives the same result on every run.
set.seed(123)

Prepare the Data

Dropping the three categorical (character) columns from the dataset, since the Euclidean distances that AGNES and DIANA are run on below require numeric variables.

# Keep only the measured variables by dropping the three character-typed
# grouping columns.
numeric_vars <- select(
  wines,
  -c(Alcohol_Level, Ash_Content, Color_Intensity_Group)
)

Scale the Data

# Standardize every column (subtract the column mean, divide by the column
# standard deviation) so that wide-ranging variables such as Proline do not
# dominate the distance calculations.
num_scaled <- scale(numeric_vars, center = TRUE, scale = TRUE)

Distance Matrix

# Pairwise Euclidean distances between the standardized observations.
# NOTE(review): the agnes() and diana() calls in this section are given
# num_scaled directly, so `d` is not used by them — confirm whether it is
# needed elsewhere.
d <- dist(num_scaled, method = "euclidean")

Question 1

AGNES Methods

# Agglomerative (AGNES) hierarchies on the standardized data, one fit per
# linkage rule, so their agglomerative coefficients can be compared.
agnes_single   <- agnes(x = num_scaled, method = "single")
agnes_complete <- agnes(x = num_scaled, method = "complete")
agnes_average  <- agnes(x = num_scaled, method = "average")
agnes_ward     <- agnes(x = num_scaled, method = "ward")

Agglomerative Coefficients

# Report the agglomerative coefficient (the `ac` component of each agnes fit)
# for every linkage, rounded to 4 decimal places.
print(paste("AGNES - single AC:", round(agnes_single$ac, 4)))
## [1] "AGNES - single AC: 0.5379"
print(paste("AGNES - complete AC:", round(agnes_complete$ac, 4)))
## [1] "AGNES - complete AC: 0.8159"
print(paste("AGNES - average AC:", round(agnes_average$ac, 4)))
## [1] "AGNES - average AC: 0.7007"
print(paste("AGNES - ward AC:", round(agnes_ward$ac, 4)))
## [1] "AGNES - ward AC: 0.9419"

Dendrogram Plots

# One plot per page with narrow margins, then one dendrogram per AGNES linkage.
par(mfrow = c(1, 1), mar = c(2, 2, 2, 2))
pltree(x = agnes_single, main = "AGNES - Single Linkage", cex = 0.7)

pltree(x = agnes_complete, main = "AGNES - Complete Linkage", cex = 0.7)

pltree(x = agnes_average, main = "AGNES - Average Linkage", cex = 0.7)

pltree(x = agnes_ward, main = "AGNES - Ward Linkage", cex = 0.7)

DIANA

# Fit the divisive (DIANA) hierarchy on the same standardized matrix and
# report its divisive coefficient (`dc`) rounded to 4 decimal places.
diana_res <- diana(num_scaled)
# round() does not keep trailing zeros, so 0.8000096 is printed as "0.8".
print(paste("DIANA DC:", round(diana_res$dc, 4)))
## [1] "DIANA DC: 0.8"

Plot

# Dendrogram of the DIANA fit, with slightly smaller labels (cex = 0.6) than
# the AGNES plots.
pltree(diana_res, main = "DIANA (Divisive) Dendrogram", cex = 0.6)

Q1-B - Comparing Agglomerative and Divisive Coefficients

# Unrounded coefficients side by side: the four AGNES agglomerative
# coefficients, followed by the DIANA divisive coefficient.
agnes_single$ac
## [1] 0.5379128
agnes_complete$ac
## [1] 0.815931
agnes_average$ac
## [1] 0.7006964
agnes_ward$ac
## [1] 0.9419172
diana_res$dc
## [1] 0.8000096

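AGNES with Ward linkage produces the strongest clustering structure: its agglomerative coefficient (0.9419) is the highest of the four linkages (complete 0.8159, average 0.7007, single 0.5379) and also exceeds DIANA's divisive coefficient (0.8000).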

Question 2

Q2-A - Median of Alcohol Level

# Median measured alcohol within each Alcohol_Level category (missing values
# ignored), one row per category.
Median_Alcohol <- summarise(
  group_by(wines, Alcohol_Level),
  Median_Alcohol = median(Alcohol, na.rm = TRUE)
)

Median_Alcohol

Median of Ash

# Median measured ash within each Ash_Content category (missing values
# ignored), one row per category.
Median_Ash <- summarise(
  group_by(wines, Ash_Content),
  Median_Ash = median(Ash, na.rm = TRUE)
)

Median_Ash

Median of Color Intensity

# Median measured color intensity within each Color_Intensity_Group category
# (missing values ignored), one row per category.
Median_ColorIntensity <- summarise(
  group_by(wines, Color_Intensity_Group),
  Median_Color_Intensity = median(Color_Intensity, na.rm = TRUE)
)

Median_ColorIntensity

Bar Plots

Median Alcohol

# Bar chart of the median alcohol per Alcohol_Level category.
# Alcohol_Level is a character column, which ggplot2 would lay out
# alphabetically; reorder() sorts the bars by their median instead, so the
# progression between categories can be read from left to right.
Med_alcplot <- ggplot(
  Median_Alcohol,
  aes(x = reorder(Alcohol_Level, Median_Alcohol), y = Median_Alcohol)
) +
  geom_col() +
  labs(title = "Median Alcohol",
       x = "Alcohol Level",
       y = "Median Alcohol (%)") +
  theme_minimal()

Med_alcplot

Median Ash

# Bar chart of the median ash per Ash_Content category.
# Ash_Content is a character column, which ggplot2 would lay out
# alphabetically; reorder() sorts the bars by their median instead, so the
# progression between categories can be read from left to right.
Med_Ashplot <- ggplot(
  Median_Ash,
  aes(x = reorder(Ash_Content, Median_Ash), y = Median_Ash)
) +
  geom_col() +
  labs(title = "Median Ash by Ash Content",
       x = "Ash Content Category",
       y = "Median Ash") +
  theme_minimal()

Med_Ashplot

Median Color Intensity

# Bar chart of the median color intensity per Color_Intensity_Group category.
# Color_Intensity_Group is a character column, which ggplot2 would lay out
# alphabetically; reorder() sorts the bars by their median instead, so the
# progression between categories can be read from left to right.
Med_Colorplot <- ggplot(
  Median_ColorIntensity,
  aes(
    x = reorder(Color_Intensity_Group, Median_Color_Intensity),
    y = Median_Color_Intensity
  )
) +
  geom_col() +
  labs(title = "Median Color Intensity by Color Intensity Group",
       x = "Color Intensity Group",
       y = "Median Color Intensity") +
  theme_minimal()

Med_Colorplot

Q2-C - Interpretation

For the Alcohol Percentage median chart, the median alcohol rises from the low to the medium to the high category, which indicates that the category names align with the real alcohol measurements.

For the Ash Content median chart, the High bar is the highest, which likewise supports that the category labels match the measured ash values.

For the Color Intensity median chart, the dark group has the highest median color intensity and the light group has the lowest, which is the ordering the group names imply.

Question 3

Q3-A - Counting Clusters

# Cut the complete-linkage AGNES tree into 3 clusters and count the members of
# each cluster.
# NOTE(review): this cuts agnes_complete, while the write-up names Ward
# (agnes_ward, AC 0.9419) as the best linkage — confirm which tree is intended.
agnes_clusters <- cutree(agnes_complete, k = 3)
table(agnes_clusters)
## agnes_clusters
##  1  2  3 
## 69 58 51
# Same 3-cluster cut and member counts for the DIANA tree.
diana_clusters <- cutree(diana_res, k = 3)
table(diana_clusters)
## diana_clusters
##  1  2  3 
## 91 38 49

This shows that AGNES (using the complete-linkage tree cut above) formed 3 clusters of sizes 69, 58, and 51. These sizes are more balanced than DIANA's, suggesting a smoother separation into three groups.

DIANA formed 3 clusters of sizes 91, 38, and 49: it kept one large, internally similar group together and split off two smaller, more distinct groups.

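I believe that AGNES with Ward linkage is the best hierarchical clustering technique for this data because it has the highest agglomerative coefficient (0.9419). Note that the AGNES cluster sizes reported above come from the complete-linkage tree, not the Ward tree.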

Q3-B - Would Another Method Be Better?

I don’t think any other method would be better. K-Means is very sensitive to outliers and to the scaling of the variables, and DBSCAN is geared towards finding clusters as dense regions of points (labelling sparse points as noise) rather than building a hierarchy. For this specific dataset, I believe that AGNES is the best strategy to use.