
Download the utility script here: utils.py

import os 
from collections import defaultdict
import numpy as np
import pandas as pd 
import geopandas as gpd 
from shapely.geometry import Point, Polygon
from sklearn.neighbors import KernelDensity  # for calculating KDE
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.datasets import make_blobs
from sklearn import metrics
from sklearn.cluster import DBSCAN
from scipy import spatial  # for the kd-tree (see week 5): https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.html 

from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib_scalebar.scalebar import ScaleBar  # https://pypi.org/project/matplotlib-scalebar/

from utils import gen_grid, run_kde, run_kmeans, run_voronoi, run_std_ellipse, run_convex_hull
os.listdir('../resources')  ## check your own folder location and change the path accordingly

load background data

subz = gpd.read_file('../resources/master-plan-2019-subzone_landuse_area.zip')  # point to your own file location
print(subz.crs)
subz.plot()

load point data

QT = pd.read_csv('../resources/citydata-Queenstown-20200915.csv.xz', index_col=0)
QT.head()
QT = gpd.GeoDataFrame(QT, geometry=gpd.points_from_xy(QT['xx'], QT['yy']), crs=subz.crs)
QT.plot()

hour_group = {
    0: 'midnight',  # 00-04
    1: 'morning commuting',  # 04-08
    2: 'before lunch',  # 08-12
    3: 'after lunch',  # 12-16
    4: 'evening commuting',  # 16-20
    5: 'night',  # 20-24
}
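
the comments above imply each hour_group is a four-hour bin; as a small illustration (an assumption based on those comments; the CSV loaded above already ships with an hour_group column), mapping a raw 0-23 hour to a group would be:

def to_hour_group(hour):
    # assumed mapping: four-hour bins, 0: 00-04, 1: 04-08, ..., 5: 20-24
    return hour // 4

print(to_hour_group(7), '->', hour_group[to_hour_group(7)])  # 1 -> morning commuting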

prepare grid

xmin, ymin, xmax, ymax = QT.total_bounds
xlim = [xmin, xmax]
ylim = [ymin, ymax]
xmin, ymin, xmax, ymax
crs = QT.crs

size = 100
len_x = xmax - xmin
len_y = ymax - ymin

print(len_x, len_y)

ncol = int(np.ceil(len_x/size))
nrow = int(np.ceil(len_y/size))
print(ncol, nrow, ncol*nrow)

grid = gen_grid(xmin, ymin, xmax, ymax, ncol=ncol, nrow=nrow, force_square=True, crs=crs)
cent = grid.centroid
grid['x'] = cent.x
grid['y'] = cent.y

print(grid.head())
grid.plot(fc='none', ec='k')
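
gen_grid is defined in utils.py; for reference, a minimal sketch of what such a helper could look like (an assumption, check the downloaded script for the actual implementation, which also handles force_square):

from shapely.geometry import box

def gen_grid_sketch(xmin, ymin, xmax, ymax, ncol, nrow, crs=None):
    # hypothetical stand-in for utils.gen_grid: a GeoDataFrame of rectangular cells
    dx = (xmax - xmin) / ncol
    dy = (ymax - ymin) / nrow
    cells = [box(xmin + i*dx, ymin + j*dy, xmin + (i+1)*dx, ymin + (j+1)*dy)
             for j in range(nrow) for i in range(ncol)]
    return gpd.GeoDataFrame(geometry=cells, crs=crs)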

let’s start with KDE (using all points)

grid_data = grid[['x', 'y']]
grid_data = np.array(grid_data)
grid_data, grid_data.shape
print(QT.head())
all_pts = QT[['xx', 'yy']]
kde_all = KernelDensity(kernel='exponential', bandwidth=400).fit(all_pts)  # bandwidth in map units (metres here)
grid['all_dens'] = np.exp(kde_all.score_samples(grid_data))  # score_samples returns log-density, hence the exp

kde_all
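
a quick sanity check: the KDE values are densities per unit area, so multiplying by the cell area (size x size metres) and summing over the grid should come out close to 1, assuming the grid covers most of the density mass:

# approximate check: density * cell area summed over the grid should be ~1
# (only approximate, since some mass falls outside the grid extent)
print((grid['all_dens'] * size * size).sum())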

fig, ax = plt.subplots(figsize=(5,5))

grid.plot('all_dens', cmap='Blues', ax=ax)
subz.plot(fc='none', ec='k', ax=ax)

# Create scale bar
scalebar = ScaleBar(1, "m", fixed_value=500, fixed_units='m')
ax.add_artist(scalebar)

ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()

now run the KDE for four of the hour groups:
1 (morning commuting, 04-08), 2 (before lunch, 08-12), 4 (evening commuting, 16-20), and 5 (night, 20-24)

this_pts = QT

new_cols = []
for i, hg in enumerate([1, 2, 4, 5]):
    tmp = this_pts[this_pts['hour_group']==hg]
    #c = [clrs[3]]*len(tmp)
    zs = run_kde(tmp, grid_data, bandwidth=400)  ## <---- go to script
    hg_col = 'hg_{}_dens'.format(hg)
    grid[hg_col] = zs  # store the KDE result to grid
    new_cols.append(hg_col)
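
run_kde comes from utils.py; judging from the all-points cell above, a plausible sketch of it (an assumption, not the actual script) wraps KernelDensity the same way:

def run_kde_sketch(pts_gdf, grid_xy, bandwidth=400, kernel='exponential'):
    # hypothetical stand-in for utils.run_kde, mirroring the all-points cell above
    xy = np.array([pts_gdf.geometry.x, pts_gdf.geometry.y]).T
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(xy)
    return np.exp(kde.score_samples(grid_xy))  # exp of log-density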


title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

fig, axg = plt.subplots(2, 2, figsize=(12, 11), sharex=True, sharey=True)
axs = axg.flatten()

vmax = grid[new_cols].max().max()

for i, hg in enumerate([1, 2, 4, 5]):
    ax = axs[i]
    tmp = this_pts[this_pts['hour_group']==hg]
    hg_col = 'hg_{}_dens'.format(hg)
    grid.plot(hg_col, cmap='Greys', vmin=0, vmax=vmax, ax=ax)
    tmp.plot(c='r', ax=ax, markersize=1, alpha=.05)
    subz.plot(fc='none', ec='k', ax=ax)

    ax.set_title(title_dict[hg])
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    #break

plt.tight_layout()

k-means clustering

try visualising with convex hulls

fg_clrs = sns.color_palette('muted')
bg_clrs = sns.color_palette('pastel')

n_clusters = 6
this_pts = QT

x0, y0, x1, y1 = this_pts.total_bounds
xlim = (x0, x1)
ylim = (y0, y1)


fig, axg = plt.subplots(2, 2, figsize=(12, 11), sharex=True, sharey=True)
axs = axg.flatten()


title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

for i, hg in enumerate([1, 2, 4, 5]):
    ax = axs[i]
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()  # <--- .copy() makes a local copy,
    # detached from the original DataFrame; it is overwritten on each iteration,
    # so only the last instance (hg=5) survives after the loop
    #Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T
    k_means_labels, centers_gdf = run_kmeans(this_gdf, n_clusters)  # look at the script to see how it is run
    this_gdf['k'] = k_means_labels
    #print(this_gdf)
    
    hull_gdf = run_convex_hull(this_gdf, 'k')  # generate convex hull

    # plotting
    # plot the convex hulls as background
    hull_gdf.plot(fc=[bg_clrs[k] for k in hull_gdf['k']], 
                 alpha=0.25, ec='k',
                 zorder=2, ax=ax)
    #print(hull_gdf)

    # plot the points
    this_gdf.plot(color=[fg_clrs[k] for k in k_means_labels], 
                  marker=".", alpha=.25, markersize=10, 
                  zorder=5, ax=ax, )
    
    # plot the k-means centers
    centers_gdf.plot(color=[fg_clrs[k] for k in centers_gdf['k']],  
                     marker='*',
                     ax=ax,
                     zorder=6,
                     markersize=90,
                    )
    
    subz.plot(fc='none', ec='k', ax=ax, alpha=.1)
    
    ax.set_title(title_dict[hg])
    xlim = (x0, x1)
    ylim = (y0, y1)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xticks([])
    ax.set_yticks([])
    ## none of the results are saved; if you need them later, keep them in a list/dictionary

plt.tight_layout()
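
run_kmeans and run_convex_hull are also defined in utils.py; the stand-ins below sketch what they might look like, given how they are called here (assumptions, not the actual script):

def run_kmeans_sketch(gdf, n_clusters, random_state=42):
    # hypothetical stand-in for utils.run_kmeans: labels plus a centres GeoDataFrame with a 'k' column
    xy = np.array([gdf.geometry.x, gdf.geometry.y]).T
    km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                random_state=random_state).fit(xy)
    centers = gpd.GeoDataFrame({'k': range(n_clusters)},
                               geometry=[Point(x, y) for x, y in km.cluster_centers_],
                               crs=gdf.crs)
    return km.labels_, centers

def run_convex_hull_sketch(gdf, by='k'):
    # hypothetical stand-in for utils.run_convex_hull: one hull per cluster via dissolve
    hulls = gdf.dissolve(by=by).convex_hull  # GeoSeries indexed by cluster id
    return gpd.GeoDataFrame({by: hulls.index}, geometry=hulls.values, crs=gdf.crs)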

same thing, but add the KDE background and remove points


n_clusters = 6
this_pts = QT

x0, y0, x1, y1 = this_pts.total_bounds
xlim = (x0, x1)
ylim = (y0, y1)


fig, axg = plt.subplots(2, 2, figsize=(12, 11), sharex=True, sharey=True)
axs = axg.flatten()


title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

for i, hg in enumerate([1, 2, 4, 5]):
    ax = axs[i]
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()
    #Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T
    k_means_labels, centers_gdf = run_kmeans(this_gdf, n_clusters)
    this_gdf['k'] = k_means_labels
    #print(this_gdf)
    hg_col = 'hg_{}_dens'.format(hg)
    grid.plot(hg_col, cmap='Blues', vmin=0, vmax=vmax, ax=ax)
    
    hull_gdf = run_convex_hull(this_gdf, 'k')
    
    hull_gdf.plot(fc=[bg_clrs[k] for k in hull_gdf['k']], 
                 alpha=0.125, ec='k',
                 zorder=2, ax=ax)
    #print(hull_gdf)
    
    """this_gdf.plot(color=[fg_clrs[k] for k in k_means_labels], 
                  marker=".", alpha=.25, markersize=10, 
                  zorder=5, ax=ax, )
    """
    
    centers_gdf.plot(color=[fg_clrs[k] for k in centers_gdf['k']],  
                     marker='*',
                     ax=ax,
                     zorder=6,
                     markersize=90,
                    )
    
    ax.set_title(title_dict[hg])
    xlim = (x0, x1)
    ylim = (y0, y1)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()

use standard ellipses instead of convex hulls


n_clusters = 6
this_pts = QT

x0, y0, x1, y1 = this_pts.total_bounds
xlim = (x0, x1)
ylim = (y0, y1)


fig, axg = plt.subplots(2, 2, figsize=(12, 11), sharex=True, sharey=True)
axs = axg.flatten()


title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

for i, hg in enumerate([1, 2, 4, 5]):
    ax = axs[i]
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()
    #Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T
    k_means_labels, centers_gdf = run_kmeans(this_gdf, n_clusters)
    this_gdf['k'] = k_means_labels
    #print(this_gdf)
    hg_col = 'hg_{}_dens'.format(hg)
    grid.plot(hg_col, cmap='Blues', vmin=0, vmax=vmax, ax=ax)
    
    elps_gdf = run_std_ellipse(this_gdf, 'k', n_std=2)
    elps_gdf.plot(fc=[bg_clrs[k] for k in elps_gdf['k']], 
                 alpha=0.125, ec='k',
                 zorder=2, ax=ax)
    #print(hull_gdf)
    
    """this_gdf.plot(color=[fg_clrs[k] for k in k_means_labels], 
                  marker=".", alpha=.25, markersize=10, 
                  zorder=5, ax=ax, )
    """
    
    centers_gdf.plot(color=[fg_clrs[k] for k in centers_gdf['k']],  
                     marker='*',
                     ax=ax,
                     zorder=6,
                     markersize=90,
                    )
    
    ax.set_title(title_dict[hg])
    xlim = (x0, x1)
    ylim = (y0, y1)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
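
run_std_ellipse is another utils.py helper; a plausible sketch (an assumption: mean centre plus a covariance ellipse scaled by n_std, built with shapely.affinity):

from shapely import affinity

def run_std_ellipse_sketch(gdf, by='k', n_std=2):
    # hypothetical stand-in for utils.run_std_ellipse: one covariance ellipse per cluster
    rows = []
    for k, grp in gdf.groupby(by):
        xy = np.array([grp.geometry.x, grp.geometry.y]).T
        cx, cy = xy.mean(axis=0)
        vals, vecs = np.linalg.eigh(np.cov(xy.T))  # eigenvalues in ascending order
        angle = np.degrees(np.arctan2(vecs[1, -1], vecs[0, -1]))  # major-axis orientation
        ell = affinity.scale(Point(cx, cy).buffer(1),  # unit circle at the mean centre
                             n_std*np.sqrt(vals[-1]), n_std*np.sqrt(vals[0]))
        rows.append({by: k, 'geometry': affinity.rotate(ell, angle)})
    return gpd.GeoDataFrame(rows, crs=gdf.crs)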

replace the standard ellipses/convex hulls with Voronoi polygons

n_clusters = 6
this_pts = QT

x0, y0, x1, y1 = this_pts.total_bounds
xlim = (x0, x1)
ylim = (y0, y1)


fig, axg = plt.subplots(2, 2, figsize=(12, 11), sharex=True, sharey=True)
axs = axg.flatten()


title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

for i, hg in enumerate([1, 2, 4, 5]):
    ax = axs[i]
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()
    #Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T
    k_means_labels, centers_gdf = run_kmeans(this_gdf, n_clusters)
    this_gdf['k'] = k_means_labels
    #print(this_gdf)
    hg_col = 'hg_{}_dens'.format(hg)
    grid.plot(hg_col, cmap='Blues', vmin=0, vmax=vmax, ax=ax)
    
    vor_gdf = run_voronoi(centers_gdf, x0, y0, x1, y1)
    
    vor_gdf.plot(fc=[bg_clrs[k] for k in vor_gdf['k']], 
                 alpha=0.125, ec='k',
                 zorder=2, ax=ax)

    #print(hull_gdf)
    
    """this_gdf.plot(color=[fg_clrs[k] for k in k_means_labels], 
                  marker=".", alpha=.25, markersize=10, 
                  zorder=5, ax=ax, )
    """
    
    centers_gdf.plot(color=[fg_clrs[k] for k in centers_gdf['k']],  
                     marker='*',
                     ax=ax,
                     zorder=6,
                     markersize=90,
                    )
    
    ax.set_title(title_dict[hg])
    xlim = (x0, x1)
    ylim = (y0, y1)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
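
run_voronoi is the last utils.py helper used here; a plausible sketch (an assumption) builds the diagram from the cluster centres with shapely.ops.voronoi_diagram and clips it to the bounding box:

from shapely.geometry import MultiPoint, box
from shapely.ops import voronoi_diagram

def run_voronoi_sketch(centers_gdf, x0, y0, x1, y1):
    # hypothetical stand-in for utils.run_voronoi: one clipped cell per centre
    bounds = box(x0, y0, x1, y1)
    cells = voronoi_diagram(MultiPoint(list(centers_gdf.geometry)), envelope=bounds)
    rows = []
    for cell in cells.geoms:
        cell = cell.intersection(bounds)  # envelope alone does not guarantee clipping
        k = centers_gdf.loc[centers_gdf.intersects(cell), 'k'].iloc[0]  # the centre inside this cell
        rows.append({'k': k, 'geometry': cell})
    return gpd.GeoDataFrame(rows, crs=centers_gdf.crs)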

calculating distortion and inertia to determine k

random_state = 42
K = range(1, 16)

evaluation_res = {}

for i, hg in enumerate([1, 2, 4, 5]):
    this_gdf = this_pts[this_pts['hour_group']==hg]
    Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T

    # Initialize lists to store distortion and inertia values
    distortions = []
    inertias = []
    mapping1 = {}
    mapping2 = {}
    
    # Fit K-means for different values of k
    for k in K:
        #kmeanModel = KMeans(n_clusters=k, random_state=42).fit(X)
        k_means = KMeans(init="k-means++", n_clusters=k, n_init=10, random_state=random_state)
        k_means.fit(Xb)
        
        # Calculate distortion as the average squared distance from points to their cluster centers
        distortions.append(sum(np.min(cdist(Xb, k_means.cluster_centers_, 'euclidean'), axis=1)**2) / Xb.shape[0])
        
        # Inertia is calculated directly by KMeans
        inertias.append(k_means.inertia_)

    res = pd.DataFrame.from_dict({'k': K, 'distortion': distortions, 'inertia': inertias})
    evaluation_res[hg] = res
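
note that, because k-means assigns each point to its nearest centre, the distortion computed above is simply the inertia divided by the number of points; a quick check on the last hour group's result (should print True):

# distortion = inertia / n, since each point's nearest centre is also its assigned centre
print(np.allclose(res['distortion'], res['inertia'] / Xb.shape[0]))
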
# plot the result
title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

fig, axg = plt.subplots(4, 2, figsize=(10, 20))

for i, hg in enumerate([1, 2, 4, 5]):
    axs = axg[i]
    ax0, ax1 = axs
    
    res = evaluation_res[hg]
    ax0.plot(res['k'], res['distortion'], 'bx-')
    ax0.set_xlabel('Number of Clusters (k)')
    ax0.set_ylabel('Distortion')
    ax0.set_title('Distortion for time: {}'.format(title_dict[hg]))
    
    ax1.plot(res['k'], res['inertia'], 'bx-')
    ax1.set_xlabel('Number of Clusters (k)')
    ax1.set_ylabel('Inertia')
    ax1.set_title('Inertia for time: {}'.format(title_dict[hg]))
    
    for ax in axs:
        ax.axvline(x=6, ls='--', c='grey')
plt.tight_layout()
#fig.savefig('kmeans_distortion_inertia.png', dpi=150, bbox_inches='tight')

k = 6 seems like a reasonable choice

For more analysis on Silhouette Score, please refer to the following page: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
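
sklearn.metrics is already imported above, so a quick silhouette check is easy to bolt on; a minimal sketch for the last hour group's Xb (silhouette needs at least 2 clusters; pass sample_size to silhouette_score if this is slow on large point sets):

for k in range(2, 11):
    km = KMeans(init='k-means++', n_clusters=k, n_init=10, random_state=random_state).fit(Xb)
    score = metrics.silhouette_score(Xb, km.labels_)  # mean silhouette over all points
    print('k={}: silhouette={:.3f}'.format(k, score))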

DBSCAN (using all points)

Xb = np.array(QT[['xx', 'yy']])
Xb
db_a = DBSCAN(eps=100, min_samples=50).fit(Xb)
db_a
labels = db_a.labels_

labels

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
QT['dbscan_all'] = labels
QT.groupby(['dbscan_all'])[['did']].count()
QT.plot('dbscan_all', marker='.', markersize=2, cmap='tab20')

run DBSCAN for different hour groups

clrs = sns.color_palette('tab20')
clrs

fig, axg = plt.subplots(2, 2, figsize=(12, 11), sharex=True, sharey=True)
axs = axg.flatten()


title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

for i, hg in enumerate([1, 2, 4, 5]):
    print('Working on hour group: {}, for {}'.format(hg, title_dict[hg]))
    ax = axs[i]
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()
    Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T
    
    db_tmp = DBSCAN(eps=200, min_samples=50).fit(Xb)
    labels = db_tmp.labels_
    
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    
    print("Estimated number of clusters: %d" % n_clusters_)
    print("Estimated number of noise points: %d" % n_noise_)
    
    this_gdf['dbscan_cluster'] = labels
    tmp_count = this_gdf.groupby(['dbscan_cluster'])[['did']].count().sort_values(by='did', ascending=False)
    print(tmp_count.head(10))
    
    #print(this_gdf)
    tmp = this_gdf[this_gdf['dbscan_cluster']!=-1]  # clustered points only
    tmp.plot(color=[clrs[k%20] for k in tmp['dbscan_cluster']],  # colours must match the filtered rows, not all labels
             marker=".", alpha=.25, markersize=10, 
             zorder=5, ax=ax, )
    tmp2 = this_gdf[this_gdf['dbscan_cluster']==-1]
    tmp2.plot(color='grey', 
              marker="x", alpha=.125, markersize=10, 
              zorder=5, ax=ax, )
    
    
    ax.set_title(title_dict[hg])
    xlim = (x0, x1)
    ylim = (y0, y1)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xticks([])
    ax.set_yticks([])
    print()
plt.tight_layout()
choose eps by looking at the sorted distance to each point's min_samples-th nearest neighbour

min_samples = 50

sorted_distance_list = []

for i, hg in enumerate([1, 2, 4, 5]):
    print('Working on hour group: {}, for {}'.format(hg, title_dict[hg]))
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()
    Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T
    
    this_tree = spatial.KDTree(Xb)
    dd, ii = this_tree.query(Xb, k=min_samples)  # nearest neighbour is the point itself (distance 0)

    distance_to_min_samples = [d[-1] for d in dd]  # distance to the (min_samples-1)-th other point
    distance_to_min_samples = sorted(distance_to_min_samples)
    sorted_distance_list.append(distance_to_min_samples)

fig, axg = plt.subplots(2, 2, figsize=(12, 8))
axs = axg.flatten()

minx = -100
#maxx = max([len(a) for a in sorted_distance_list]) + 50
eps_dict = {}
for i, (hg, distance_to_min_samples) in enumerate(zip([1, 2, 4, 5], sorted_distance_list)):
    ax = axs[i]
    ax.plot(list(range(len(distance_to_min_samples))), distance_to_min_samples)

    maxx = len(distance_to_min_samples) + 100
    dis = {}
    for pc in [.8, .9, .95, .99]:
        cutoff = int(round(len(distance_to_min_samples) * pc))  # rank at this percentile
        print(cutoff)
        h = distance_to_min_samples[cutoff-1]
        ax.plot([minx, cutoff-1, cutoff-1], [h, h, 0], c='grey', ls='--')
        ax.text(10, h+.001, '{:.3f}'.format(h), ha='left', va='bottom')
        ax.text(cutoff, 0, pc, ha='center', va='top')
        dis[pc] = h
    eps_dict[hg] = dis
    ax.set_xlabel('Rank (sorted by min_samples distance)')
    ax.set_ylabel('Distance')
    ax.set_xlim([minx, maxx])
    #ax.set_ylim([0, 0.7])
plt.tight_layout()

fig, axg = plt.subplots(2, 2, figsize=(12, 11), sharex=True, sharey=True)
axs = axg.flatten()


title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

for i, hg in enumerate([1, 2, 4, 5]):
    print('Working on hour group: {}, for {}'.format(hg, title_dict[hg]))
    ax = axs[i]
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()
    Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T
    eps = eps_dict[hg][.95]  # use the 95th-percentile k-distance as eps
    db_tmp = DBSCAN(eps=eps, min_samples=50).fit(Xb)
    labels = db_tmp.labels_
    
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    
    print("Estimated number of clusters: %d" % n_clusters_)
    print("Estimated number of noise points: %d" % n_noise_)
    
    this_gdf['dbscan_cluster'] = labels
    tmp_count = this_gdf.groupby(['dbscan_cluster'])[['did']].count().sort_values(by='did', ascending=False)
    print(tmp_count.head(10))
    
    #print(this_gdf)
    tmp = this_gdf[this_gdf['dbscan_cluster']!=-1]  # clustered points only
    tmp.plot(color=[clrs[k%20] for k in tmp['dbscan_cluster']],  # colours must match the filtered rows, not all labels
             marker=".", alpha=.25, markersize=10, 
             zorder=5, ax=ax, )
    tmp2 = this_gdf[this_gdf['dbscan_cluster']==-1]
    tmp2.plot(color='grey', 
              marker="x", alpha=.125, markersize=10, 
              zorder=5, ax=ax, )
    
    
    ax.set_title(title_dict[hg])
    xlim = (x0, x1)
    ylim = (y0, y1)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xticks([])
    ax.set_yticks([])
    print()
plt.tight_layout()
now set min_samples as a proportion of the points instead of a fixed count

min_samples_prop = .01  # 1%

sorted_distance_list = []
min_pts_list = {}

for i, hg in enumerate([1, 2, 4, 5]):
    print('Working on hour group: {}, for {}'.format(hg, title_dict[hg]))
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()
    Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T

    min_samples = int(round(len(this_gdf) * min_samples_prop))
    print(min_samples)
    min_pts_list[hg] = min_samples
    
    this_tree = spatial.KDTree(Xb)
    dd, ii = this_tree.query(Xb, k=min_samples)  # nearest neighbour is the point itself (distance 0)

    distance_to_min_samples = [d[-1] for d in dd]  # distance to the (min_samples-1)-th other point
    distance_to_min_samples = sorted(distance_to_min_samples)
    sorted_distance_list.append(distance_to_min_samples)

fig, axg = plt.subplots(2, 2, figsize=(12, 8))
axs = axg.flatten()

minx = -100
#maxx = max([len(a) for a in sorted_distance_list]) + 50
eps_dict = {}
for i, (hg, distance_to_min_samples) in enumerate(zip([1, 2, 4, 5], sorted_distance_list)):
    ax = axs[i]
    ax.plot(list(range(len(distance_to_min_samples))), distance_to_min_samples)

    maxx = len(distance_to_min_samples) + 100
    dis = {}
    for pc in [.7, .8, .9, .95, .99]:
        cutoff = int(round(len(distance_to_min_samples) * pc))  # rank at this percentile
        print(cutoff)
        h = distance_to_min_samples[cutoff-1]
        ax.plot([minx, cutoff-1, cutoff-1], [h, h, 0], c='grey', ls='--')
        ax.text(10, h+.001, '{:.3f}'.format(h), ha='left', va='bottom')
        ax.text(cutoff, 0, pc, ha='center', va='top')
        dis[pc] = h
    eps_dict[hg] = dis
    ax.set_xlabel('Rank (sorted by min_pts distance)')
    ax.set_ylabel('Distance')
    ax.set_xlim([minx, maxx])
    #ax.set_ylim([0, 0.7])
plt.tight_layout()

fig, axg = plt.subplots(2, 2, figsize=(12, 11), sharex=True, sharey=True)
axs = axg.flatten()


title_dict = {
0: 'midnight',
1: 'morning commuting',
2: 'before lunch',
3: 'after lunch',
4: 'evening commuting',
5: 'night',
}

for i, hg in enumerate([1, 2, 4, 5]):
    print('Working on hour group: {}, for {}'.format(hg, title_dict[hg]))
    ax = axs[i]
    this_gdf = this_pts[this_pts['hour_group']==hg].copy()
    Xb = np.array([this_gdf.geometry.x, this_gdf.geometry.y]).T
    eps = eps_dict[hg][.7]  # the 70th-percentile k-distance gives a tighter eps here
    min_pts = min_pts_list[hg]
    db_tmp = DBSCAN(eps=eps, min_samples=min_pts).fit(Xb)
    labels = db_tmp.labels_
    
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    
    print("Estimated number of clusters: %d" % n_clusters_)
    print("Estimated number of noise points: %d" % n_noise_)
    
    this_gdf['dbscan_cluster'] = labels
    tmp_count = this_gdf.groupby(['dbscan_cluster'])[['did']].count().sort_values(by='did', ascending=False)
    print(tmp_count.head(10))
    
    #print(this_gdf)
    tmp = this_gdf[this_gdf['dbscan_cluster']!=-1]  # clustered points only
    tmp.plot(color=[clrs[k%20] for k in tmp['dbscan_cluster']],  # colours must match the filtered rows, not all labels
             marker=".", alpha=.25, markersize=10, 
             zorder=5, ax=ax, )
    tmp2 = this_gdf[this_gdf['dbscan_cluster']==-1]
    tmp2.plot(color='grey', 
              marker="x", alpha=.125, markersize=10, 
              zorder=5, ax=ax, )
    
    
    ax.set_title(title_dict[hg])
    xlim = (x0, x1)
    ylim = (y0, y1)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xticks([])
    ax.set_yticks([])
    print()
plt.tight_layout()