
Monday, October 12, 2020

Hierarchical Clustering (classification, unsupervised)



"""
Codes from "COGNITIVE CLASS.ai - Hierarchical Clustering" , author: Saeed Aghabozorgi

Agglomerative Clustering is the bottom up approach - cluster a pair of data points closest in distance together, and repeat this process to create a hierarchical tree.
"""

import numpy as np
import pandas as pd
from scipy import ndimage
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
from matplotlib import pyplot as plt
import pylab
from scipy.cluster.hierarchy import fcluster
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn
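
# A minimal end-to-end sketch of the bottom-up idea described in the docstring above
# (my addition, using toy blobs from make_blobs rather than the data.csv walkthrough below):
X_toy, _ = make_blobs(n_samples=15, centers=3, random_state=0)
toy_model = AgglomerativeClustering(n_clusters=3, linkage='complete').fit(X_toy)
print(toy_model.labels_)  # one cluster id per toy data point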

##### Step1: Data & Normalization #####
df = pd.read_csv("data.csv",delimiter=' ')
featureset = df[['X1','X2',...]]
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler transforms features by scaling each feature to a given range; the default range is (0, 1).
x = featureset.values # returns a numpy array
feature_mtx = MinMaxScaler().fit_transform(x)
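
# Quick sanity check of the scaling on toy values (my addition): each column is mapped
# independently onto [0, 1], so the per-feature minimum becomes 0 and the maximum becomes 1.
toy = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 40.0]])
print(MinMaxScaler().fit_transform(toy))
# -> [[0, 0], [0.5, 0.333...], [1, 1]] (per-column min -> 0, max -> 1)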

##### Step2: Clustering using Scipy #####
from scipy.spatial import distance
leng = feature_mtx.shape[0]
D = np.zeros([leng, leng])  # pairwise Euclidean distance matrix (scipy.zeros was removed from recent SciPy)
for i in range(leng):
	for j in range(leng):
		D[i, j] = distance.euclidean(feature_mtx[i], feature_mtx[j])

# In agglomerative clustering, at each iteration the algorithm must update the distance matrix to reflect the distance of the newly formed cluster to each remaining cluster in the forest. SciPy supports the following methods for computing that distance:
# single, complete, average, weighted, centroid
from scipy.spatial.distance import squareform
# linkage expects a condensed (1-D) distance matrix, so convert the square matrix D with squareform
Z = hierarchy.linkage(squareform(D), 'complete')
# Hierarchical clustering does not require a pre-specified number of clusters. However, in some applications we want a partition of disjoint clusters, just as in flat clustering. In that case you can cut the tree at a chosen height:
max_d = 3
clusters = fcluster(Z, max_d, criterion='distance')

# Also, you can determine the number of clusters directly:
# k = 5
# clusters = fcluster(Z, k, criterion='maxclust')
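
# Sketch contrasting the two cut criteria on the same linkage Z (my addition; the max_d
# values are arbitrary): 'distance' cuts the tree at a fixed height, so the number of flat
# clusters depends on the data, while 'maxclust' picks a height that yields at most k clusters.
for d in (1, 3, 5):
	print('max_d =', d, '->', len(np.unique(fcluster(Z, d, criterion='distance'))), 'clusters')
print('maxclust = 5 ->', len(np.unique(fcluster(Z, 5, criterion='maxclust'))), 'clusters')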

##### Step3: Plot the dendrogram #####
fig = pylab.figure(figsize=(18, 50))
def llf(idx):
	# Label each leaf with a few identifying columns of its row (here the X1-X3 columns used elsewhere in this script; adjust to your data).
	return '[%s %s %s]' % (df['X1'][idx], df['X2'][idx], df['X3'][idx])

dendro = hierarchy.dendrogram(Z, leaf_label_func=llf, leaf_rotation=0, leaf_font_size=12, orientation='right')
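
# Optional sketch (my addition): for large datasets the full dendrogram is unreadable, so
# SciPy can truncate it to the last p merged clusters; p=12 here is an arbitrary choice.
plt.figure(figsize=(10, 6))
hierarchy.dendrogram(Z, truncate_mode='lastp', p=12, show_contracted=True)
plt.show()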



##### Step2-2: Clustering using scikit-learn #####
# linkage criteria in the sklearn package (a short comparison sketch follows this list):
# ward: minimizes the sum of squared differences within all clusters. It is a variance-minimizing approach and in this sense is similar to the k-means objective function, but tackled with an agglomerative hierarchical approach.
# maximum or complete: minimizes the maximum distance between observations of pairs of clusters.
# average: minimizes the average of the distances between all observations of pairs of clusters.
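
# Comparison sketch referenced above (my addition): fit the same scaled data with each
# linkage criterion and report the size of the largest resulting cluster; 'ward' tends to
# produce more balanced cluster sizes, while 'complete' and 'average' can differ noticeably.
for link in ('ward', 'complete', 'average'):
	labels = AgglomerativeClustering(n_clusters=6, linkage=link).fit_predict(feature_mtx)
	print(link, '-> largest cluster has', np.bincount(labels).max(), 'points')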

dist_matrix = distance_matrix(feature_mtx, feature_mtx)
agglom = AgglomerativeClustering(n_clusters=6, linkage='complete')
agglom.fit(feature_mtx)
df['cluster_'] = agglom.labels_ # Add a new field to our dataframe to show the cluster of each row
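
# Note: the precomputed dist_matrix above is not needed for the fit on feature_mtx; sklearn
# computes distances itself. A sketch of the precomputed route (assumes scikit-learn >= 1.2,
# which names the parameter metric; older releases use affinity='precomputed' instead):
agglom_pre = AgglomerativeClustering(n_clusters=6, linkage='complete', metric='precomputed')
agglom_pre.fit(dist_matrix)  # labels_ should match agglom.labels_ up to relabelling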

##### Step3-2: Visualization #####
import matplotlib.cm as cm
n_clusters = max(agglom.labels_) + 1
colors = cm.rainbow(np.linspace(0, 1, n_clusters))
cluster_labels = list(range(0, n_clusters))

plt.figure()
for color, label in zip(colors, cluster_labels):
	subset = df[df.cluster_ == label]
	for i in subset.index:
		plt.text(subset.X1[i], subset.X2[i], str(subset['X3'][i]), rotation=25)
	plt.scatter(subset.X1, subset.X2, s=subset['X4'], color=color, label='cluster '+str(label), alpha=0.5)
plt.legend()
plt.title('Clusters')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()