demonstration using random points
Download the util script file here: utils.py
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point #, Polygon
from sklearn.neighbors import KernelDensity # for KDE calculation (week 6)
from sklearn.cluster import KMeans # for K-means clustering calculation (week 7)
from sklearn.metrics.pairwise import pairwise_distances_argmin # [k-means] for finding which center each point is following (belongs to)
from sklearn.cluster import DBSCAN # for dbscan calculation (week 7)
from scipy import spatial # for the kd-tree (see week 5): https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.html
from sklearn.datasets import make_blobs # for creating random demo clusters
from scipy.spatial.distance import cdist # distortion calculation
# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_scalebar.scalebar import ScaleBar # https://pypi.org/project/matplotlib-scalebar/
# custom script of functions
from utils import gen_grid # for the grid for calculating kde
from utils import run_kmeans, run_voronoi, run_std_ellipse, run_convex_hull # for k-meanscreate clustered points (random)¶
## random_state is the random seed
centers = [[4, 4], [2, 2], [4, 2], [1.5, 4]]
pts, labels_true = make_blobs(
n_samples=750, centers=centers, cluster_std=0.4, random_state=1
)
plt.scatter(pts[:, 0], pts[:, 1])# see how the data looks like
print(pts.shape)
pts # two columns of coordinates, first x, second y
pts_df = pd.DataFrame.from_dict({
'id': list(range(750)),
'x': pts[:,0],
'y': pts[:,1],
'geometry': [Point(x,y) for x,y in zip(pts[:,0], pts[:,1])]
})
pts_gdf = gpd.GeoDataFrame(pts_df, geometry=pts_df['geometry'], crs=None)
pts_gdf.plot()generate grid for KDE (week 6)¶
xmin = min(pts[:,0])
xmax = max(pts[:,0])
ymin = min(pts[:,1])
ymax = max(pts[:,1])
xlim = [xmin, xmax]
ylim = [ymin, ymax]xmin, ymin, xmax, ymaxsize = .1
len_x = xmax - xmin
len_y = ymax - ymin
print(len_x, len_y)
ncol = int(np.ceil(len_x/size))
nrow = int(np.ceil(len_y/size))
print(ncol, nrow, ncol*nrow)
grid = gen_grid(xmin, ymin, xmax, ymax, ncol=ncol, nrow=nrow, force_square=True, crs=None)
cent = grid.centroid
grid['x'] = cent.x
grid['y'] = cent.y
grid.plot(fc='none', ec='k')grid_data = grid[['x', 'y']]
grid_data = np.array(grid_data)
grid_data, grid_data.shapecalculate KDE and plotting (week 6)¶
demo_kde = KernelDensity(kernel='exponential', bandwidth=.4).fit(pts)
grid['dens'] = np.exp(demo_kde.score_samples(grid_data))
demo_kde
fig, ax = plt.subplots(figsize=(5,5))
grid.plot('dens', cmap='Greys', ax=ax)
#ax.scatter(pts[:, 0], pts[:, 1], marker='.', s=1, c='r')
pts_gdf.plot(marker='.', markersize=2, c='r', ax=ax)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()run k-means¶
k_means = KMeans(init="k-means++", n_clusters=5, n_init=10)
k_means.fit(pts)
k_meansk_means_cluster_centers = k_means.cluster_centers_
k_means_cluster_centersk_means_labels = pairwise_distances_argmin(pts, k_means_cluster_centers)
k_means_labels, k_means_labels.shapepts_gdf['kmeans_label'] = k_means_labels
pts_gdf.head()
fig, ax = plt.subplots(figsize=(5,5))
#grid.plot('dens', cmap='Greys', ax=ax, alpha=.3)
pts_gdf.plot('kmeans_label', marker='.', markersize=10, ax=ax, cmap='Set1')
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()clrs = sns.color_palette('muted')
clrs
clrs_dict = {k: clrs[k] for k in range(4)} # list comprehension for dictionary
pts_gdf['kmeans_color'] = pts_gdf['kmeans_label'].map(clrs_dict)
clrs_dict, pts_gdf
fig, ax = plt.subplots(figsize=(5,5))
#grid.plot('dens', cmap='Greys', ax=ax, alpha=.3)
pts_gdf.plot(color=pts_gdf['kmeans_color'], marker='.', markersize=10, ax=ax)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()generating Voronoi polygon using the custom function¶
centers = pd.DataFrame(k_means_cluster_centers, columns=['x', 'y'])
centers['k'] = list(range(len(k_means_cluster_centers)))
centers = centers[['k', 'x', 'y']]
centers = gpd.GeoDataFrame(centers,
geometry=[Point(x, y) for i, (x, y) in centers[['x', 'y']].iterrows()],
crs=None)
centersclrs = sns.color_palette('pastel')
clrs
vor_gdf = run_voronoi(centers, xmin, ymin, xmax, ymax)
fig, ax = plt.subplots(figsize=(5,5))
vor_gdf.plot(fc=[clrs[k] for k in vor_gdf['k']],
alpha=0.25, ec='k',
zorder=2, ax=ax)
pts_gdf.plot(color=pts_gdf['kmeans_color'], marker='.', markersize=10, ax=ax)
centers.plot(ax=ax, marker='*', markersize=60, c='k')
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()k-means: selecting k using elbow method¶
K = range(1, 16)
list(K)random_state = 42 # random seed
# Initialize lists to store distortion and inertia values
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}
# Fit K-means for different values of k
for k in K:
#kmeanModel = KMeans(n_clusters=k, random_state=42).fit(X)
k_means = KMeans(init="k-means++", n_clusters=k, n_init=10, random_state=random_state)
k_means.fit(pts)
# Calculate distortion as the average squared distance from points to their cluster centers
distortions.append(sum(np.min(cdist(pts, k_means.cluster_centers_, 'euclidean'), axis=1)**2) / pts.shape[0])
# Inertia is calculated directly by KMeans
inertias.append(k_means.inertia_)
res = pd.DataFrame.from_dict({'k': K, 'distortion': distortions, 'inertia': inertias})
res
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
ax0, ax1 = axs
ax0.plot(K, res['distortion'], 'bx-')
ax0.set_xlabel('Number of Clusters (k)')
ax0.set_ylabel('Distortion')
ax0.set_title('The Elbow Method using Distortion')
ax1.plot(K, res['inertia'], 'bx-')
ax1.set_xlabel('Number of Clusters (k)')
ax1.set_ylabel('Inertia')
ax1.set_title('The Elbow Method using Inertia')
for ax in axs:
ax.axvline(x=4, ls='--', c='grey')
plt.tight_layout()
for DBSCAN¶
pts_gdf
db_a = DBSCAN(eps=0.3, min_samples=10).fit(pts)
db_adb_a.labels_pts_gdf['dbscan_label'] = db_a.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_a.labels_)) - (1 if -1 in db_a.labels_ else 0)
n_noise_ = list(db_a.labels_).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)clrs = sns.color_palette('muted')
clrs_dict = {k: clrs[k] for k in range(4)}
clrs_dict.update({-1: 'k'})
pts_gdf['dbscan_color'] = pts_gdf['dbscan_label'].map(clrs_dict)
clrs_dict
fig, ax = plt.subplots(figsize=(5,5))
#grid.plot('dens', cmap='Greys', ax=ax, alpha=.3)
pts_gdf.plot(color=pts_gdf['dbscan_color'], marker='.', markersize=10, ax=ax,)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()pts_gdf
db_b = DBSCAN(eps=0.3, min_samples=5).fit(pts)
pts_gdf['dbscan_label_b'] = db_b.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_b.labels_)) - (1 if -1 in db_b.labels_ else 0)
n_noise_ = list(db_b.labels_).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
clrs_dict = {k: clrs[k] for k in range(len(pts_gdf['dbscan_label_b'].unique()))}
clrs_dict.update({-1: 'k'})
pts_gdf['dbscan_color_b'] = pts_gdf['dbscan_label_b'].map(clrs_dict)
fig, ax = plt.subplots(figsize=(5,5))
pts_gdf.plot(color=pts_gdf['dbscan_color_b'], marker='.', markersize=10, ax=ax,)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()
db_c = DBSCAN(eps=0.15, min_samples=5).fit(pts)
pts_gdf['dbscan_label_c'] = db_c.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_c.labels_)) - (1 if -1 in db_c.labels_ else 0)
n_noise_ = list(db_c.labels_).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
clrs_dict = {k: clrs[k] for k in range(len(pts_gdf['dbscan_label_c'].unique()))}
clrs_dict.update({-1: 'k'})
pts_gdf['dbscan_color_c'] = pts_gdf['dbscan_label_c'].map(clrs_dict)
fig, ax = plt.subplots(figsize=(5,5))
pts_gdf.plot(color=pts_gdf['dbscan_color_c'], marker='.', markersize=10, ax=ax,)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()searching for the epsilon (distance)¶
using the k-nearest neighbor distance distribution
ptstree = spatial.KDTree(pts)min_samples = 4
dd, ii = tree.query(pts, k=min_samples)
dd, iidistance_to_min_samples = [d[-1] for d in dd]
distance_to_min_samples = sorted(distance_to_min_samples)
fig, ax = plt.subplots(figsize=(6,4))
ax.plot(list(range(len(distance_to_min_samples))), distance_to_min_samples)
minx = -5
dis = []
for rank in [600, 650, 700]:
h = distance_to_min_samples[rank-1]
ax.plot([minx, rank-1, rank-1], [h, h, 0], c='grey', ls='--')
ax.text(10, h+.001, '{:.3f}'.format(h), ha='left', va='bottom')
dis.append(h)
ax.set_xlabel('Rank (sort by min_pts distance)')
ax.set_ylabel('Distance')
ax.set_xlim([minx, 800])
ax.set_ylim([0, 0.7])
plt.tight_layout()fig, ax = plt.subplots(figsize=(6,4))
sns.kdeplot(distance_to_min_samples)
for h in dis:
ax.axvline(x=h, ls='--', c='grey')
plt.tight_layout()total = len(distance_to_min_samples)
for d in dis:
tmp = [v for v in distance_to_min_samples if v<=d]
print('epsilon:', round(d, 3), ', non-core nodes percent:', round((total-len(tmp)) / total, 3))
# non-core = border + noise
db_c = DBSCAN(eps=dis[-1], min_samples=4).fit(pts)
pts_gdf['dbscan_label_c'] = db_c.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_c.labels_)) - (1 if -1 in db_c.labels_ else 0)
n_noise_ = list(db_c.labels_).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
clrs_dict = {k: clrs[k] for k in sorted(range(len(pts_gdf['dbscan_label_c'].unique()))) if k!=-1}
clrs_dict.update({-1: 'k'})
pts_gdf['dbscan_color_c'] = pts_gdf['dbscan_label_c'].map(clrs_dict)
fig, ax = plt.subplots(figsize=(5,5))
tmp1 = pts_gdf[pts_gdf['dbscan_label_c']!=-1]
tmp1.plot(color=tmp1['dbscan_color_c'], marker='.', markersize=10, ax=ax,)
tmp2 = pts_gdf[pts_gdf['dbscan_label_c']==-1]
tmp2.plot(color=tmp2['dbscan_color_c'], marker='x', markersize=5, ax=ax,)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()25/750 # noise only