Skip to article frontmatterSkip to article content

demonstration using random points

Download the util script file here: utils.py

import numpy as np
import pandas as pd 
import geopandas as gpd 
from shapely.geometry import Point  #, Polygon
from sklearn.neighbors import KernelDensity  # for KDE calculation (week 6)

from sklearn.cluster import KMeans  # for K-means clustering calculation (week 7)
from sklearn.metrics.pairwise import pairwise_distances_argmin  # [k-means] for finding which center each point is following (belongs to)
from sklearn.cluster import DBSCAN  # for dbscan calculation (week 7)
from scipy import spatial  # for the kd-tree (see week 5): https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.html 

from sklearn.datasets import make_blobs  # for creating random demo clusters

from scipy.spatial.distance import cdist  # distortion calculation

# for plotting
import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib_scalebar.scalebar import ScaleBar  # https://pypi.org/project/matplotlib-scalebar/

# custom script of functions
from utils import gen_grid  # for the grid for calculating kde
from utils import run_kmeans, run_voronoi, run_std_ellipse, run_convex_hull  # for k-means

create clustered points (random)

## random_state is the random seed

centers = [[4, 4], [2, 2], [4, 2], [1.5, 4]]
pts, labels_true = make_blobs(
    n_samples=750, centers=centers, cluster_std=0.4, random_state=1
)
plt.scatter(pts[:, 0], pts[:, 1])
# see how the data looks like
print(pts.shape)
pts  # two columns of coordinates, first x, second y
pts_df = pd.DataFrame.from_dict({
    'id': list(range(750)), 
    'x': pts[:,0], 
    'y': pts[:,1],
    'geometry': [Point(x,y) for x,y in zip(pts[:,0], pts[:,1])]
})

pts_gdf = gpd.GeoDataFrame(pts_df, geometry=pts_df['geometry'], crs=None)
pts_gdf.plot()

generate grid for KDE (week 6)

xmin = min(pts[:,0])
xmax = max(pts[:,0])
ymin = min(pts[:,1])
ymax = max(pts[:,1])

xlim = [xmin, xmax]
ylim = [ymin, ymax]
xmin, ymin, xmax, ymax
size = .1
len_x = xmax - xmin
len_y = ymax - ymin

print(len_x, len_y)

ncol = int(np.ceil(len_x/size))
nrow = int(np.ceil(len_y/size))
print(ncol, nrow, ncol*nrow)

grid = gen_grid(xmin, ymin, xmax, ymax, ncol=ncol, nrow=nrow, force_square=True, crs=None)
cent = grid.centroid
grid['x'] = cent.x
grid['y'] = cent.y
grid.plot(fc='none', ec='k')
grid_data = grid[['x', 'y']]
grid_data = np.array(grid_data)
grid_data, grid_data.shape

calculate KDE and plotting (week 6)

demo_kde = KernelDensity(kernel='exponential', bandwidth=.4).fit(pts)
grid['dens'] = np.exp(demo_kde.score_samples(grid_data))

demo_kde

fig, ax = plt.subplots(figsize=(5,5))

grid.plot('dens', cmap='Greys', ax=ax)

#ax.scatter(pts[:, 0], pts[:, 1], marker='.', s=1, c='r')
pts_gdf.plot(marker='.', markersize=2, c='r', ax=ax)

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()

run k-means

k_means = KMeans(init="k-means++", n_clusters=5, n_init=10)
k_means.fit(pts)
k_means
k_means_cluster_centers = k_means.cluster_centers_
k_means_cluster_centers
k_means_labels = pairwise_distances_argmin(pts, k_means_cluster_centers)
k_means_labels, k_means_labels.shape
pts_gdf['kmeans_label'] = k_means_labels

pts_gdf.head()

fig, ax = plt.subplots(figsize=(5,5))

#grid.plot('dens', cmap='Greys', ax=ax, alpha=.3)

pts_gdf.plot('kmeans_label', marker='.', markersize=10, ax=ax, cmap='Set1')

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()
clrs = sns.color_palette('muted')
clrs

clrs_dict = {k: clrs[k] for k in range(4)}  # list comprehension for dictionary

pts_gdf['kmeans_color'] = pts_gdf['kmeans_label'].map(clrs_dict)

clrs_dict, pts_gdf

fig, ax = plt.subplots(figsize=(5,5))

#grid.plot('dens', cmap='Greys', ax=ax, alpha=.3)

pts_gdf.plot(color=pts_gdf['kmeans_color'], marker='.', markersize=10, ax=ax)

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()

generating Voronoi polygon using the custom function


centers = pd.DataFrame(k_means_cluster_centers, columns=['x', 'y'])
centers['k'] = list(range(len(k_means_cluster_centers)))
centers = centers[['k', 'x', 'y']]
centers = gpd.GeoDataFrame(centers, 
                           geometry=[Point(x, y) for i, (x, y) in centers[['x', 'y']].iterrows()], 
                           crs=None)
centers
clrs = sns.color_palette('pastel')
clrs

vor_gdf = run_voronoi(centers, xmin, ymin, xmax, ymax)

fig, ax = plt.subplots(figsize=(5,5))

vor_gdf.plot(fc=[clrs[k] for k in vor_gdf['k']], 
             alpha=0.25, ec='k',
             zorder=2, ax=ax)

pts_gdf.plot(color=pts_gdf['kmeans_color'], marker='.', markersize=10, ax=ax)

centers.plot(ax=ax, marker='*', markersize=60, c='k')

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()

k-means: selecting k using elbow method

K = range(1, 16)
list(K)
random_state = 42  # random seed


# Initialize lists to store distortion and inertia values
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}

# Fit K-means for different values of k
for k in K:
    #kmeanModel = KMeans(n_clusters=k, random_state=42).fit(X)
    k_means = KMeans(init="k-means++", n_clusters=k, n_init=10, random_state=random_state)
    k_means.fit(pts)
    
    # Calculate distortion as the average squared distance from points to their cluster centers
    distortions.append(sum(np.min(cdist(pts, k_means.cluster_centers_, 'euclidean'), axis=1)**2) / pts.shape[0])
    
    # Inertia is calculated directly by KMeans
    inertias.append(k_means.inertia_)

res = pd.DataFrame.from_dict({'k': K, 'distortion': distortions, 'inertia': inertias})
res

fig, axs = plt.subplots(1, 2, figsize=(10, 5))
ax0, ax1 = axs
ax0.plot(K, res['distortion'], 'bx-')
ax0.set_xlabel('Number of Clusters (k)')
ax0.set_ylabel('Distortion')
ax0.set_title('The Elbow Method using Distortion')

ax1.plot(K, res['inertia'], 'bx-')
ax1.set_xlabel('Number of Clusters (k)')
ax1.set_ylabel('Inertia')
ax1.set_title('The Elbow Method using Inertia')

for ax in axs:
    ax.axvline(x=4, ls='--', c='grey')
plt.tight_layout()

for DBSCAN

pts_gdf

db_a = DBSCAN(eps=0.3, min_samples=10).fit(pts)
db_a
db_a.labels_
pts_gdf['dbscan_label'] = db_a.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_a.labels_)) - (1 if -1 in db_a.labels_ else 0)
n_noise_ = list(db_a.labels_).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
clrs = sns.color_palette('muted')
clrs_dict = {k: clrs[k] for k in range(4)}
clrs_dict.update({-1: 'k'})

pts_gdf['dbscan_color'] = pts_gdf['dbscan_label'].map(clrs_dict)

clrs_dict

fig, ax = plt.subplots(figsize=(5,5))

#grid.plot('dens', cmap='Greys', ax=ax, alpha=.3)

pts_gdf.plot(color=pts_gdf['dbscan_color'], marker='.', markersize=10, ax=ax,)

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()
pts_gdf

db_b = DBSCAN(eps=0.3, min_samples=5).fit(pts)
pts_gdf['dbscan_label_b'] = db_b.labels_


# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_b.labels_)) - (1 if -1 in db_b.labels_ else 0)
n_noise_ = list(db_b.labels_).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

clrs_dict = {k: clrs[k] for k in range(len(pts_gdf['dbscan_label_b'].unique()))}
clrs_dict.update({-1: 'k'})
pts_gdf['dbscan_color_b'] = pts_gdf['dbscan_label_b'].map(clrs_dict)


fig, ax = plt.subplots(figsize=(5,5))

pts_gdf.plot(color=pts_gdf['dbscan_color_b'], marker='.', markersize=10, ax=ax,)

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()

db_c = DBSCAN(eps=0.15, min_samples=5).fit(pts)
pts_gdf['dbscan_label_c'] = db_c.labels_


# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_c.labels_)) - (1 if -1 in db_c.labels_ else 0)
n_noise_ = list(db_c.labels_).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

clrs_dict = {k: clrs[k] for k in range(len(pts_gdf['dbscan_label_c'].unique()))}
clrs_dict.update({-1: 'k'})
pts_gdf['dbscan_color_c'] = pts_gdf['dbscan_label_c'].map(clrs_dict)


fig, ax = plt.subplots(figsize=(5,5))

pts_gdf.plot(color=pts_gdf['dbscan_color_c'], marker='.', markersize=10, ax=ax,)

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()

searching for the epsilon (distance)

using the k-nearest neighbor distance distribution

pts
tree = spatial.KDTree(pts)
min_samples = 4
dd, ii = tree.query(pts, k=min_samples)
dd, ii
distance_to_min_samples = [d[-1] for d in dd]
distance_to_min_samples = sorted(distance_to_min_samples)

fig, ax = plt.subplots(figsize=(6,4))

ax.plot(list(range(len(distance_to_min_samples))), distance_to_min_samples)

minx = -5
dis = []
for rank in [600, 650, 700]:
    h = distance_to_min_samples[rank-1]
    ax.plot([minx, rank-1, rank-1], [h, h, 0], c='grey', ls='--')
    ax.text(10, h+.001, '{:.3f}'.format(h), ha='left', va='bottom')
    dis.append(h)

ax.set_xlabel('Rank (sort by min_pts distance)')
ax.set_ylabel('Distance')
ax.set_xlim([minx, 800])
ax.set_ylim([0, 0.7])
plt.tight_layout()
fig, ax = plt.subplots(figsize=(6,4))

sns.kdeplot(distance_to_min_samples)
for h in dis:
    ax.axvline(x=h, ls='--', c='grey')
plt.tight_layout()
total = len(distance_to_min_samples)
for d in dis:
    tmp = [v for v in distance_to_min_samples if v<=d]
    print('epsilon:', round(d, 3), ', non-core nodes percent:', round((total-len(tmp)) / total, 3))

# non-core = border + noise

db_c = DBSCAN(eps=dis[-1], min_samples=4).fit(pts)
pts_gdf['dbscan_label_c'] = db_c.labels_


# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_c.labels_)) - (1 if -1 in db_c.labels_ else 0)
n_noise_ = list(db_c.labels_).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

clrs_dict = {k: clrs[k] for k in sorted(range(len(pts_gdf['dbscan_label_c'].unique()))) if k!=-1}
clrs_dict.update({-1: 'k'})
pts_gdf['dbscan_color_c'] = pts_gdf['dbscan_label_c'].map(clrs_dict)


fig, ax = plt.subplots(figsize=(5,5))

tmp1 = pts_gdf[pts_gdf['dbscan_label_c']!=-1]
tmp1.plot(color=tmp1['dbscan_color_c'], marker='.', markersize=10, ax=ax,)

tmp2 = pts_gdf[pts_gdf['dbscan_label_c']==-1]
tmp2.plot(color=tmp2['dbscan_color_c'], marker='x', markersize=5, ax=ax,)

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()
25/750  # noise only