# import os
import pandas as pd
import geopandas as gpd
# for plotting the spatial relationships
from shapely.geometry import LineString
## the pysal weights and Moran's I calculation
# from pysal.lib import weights
from libpysal import weights
#from libpysal.weights import higher_order
#from pysal.explore import esda # esda: Exploratory Spatial Data Analysis
import esda
from splot.esda import moran_scatterplot
# Plotting functions
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm

# quick sanity check that the pysal weights API is available
weights.Queen, weights.higher_order

#!pip install pysal

# train and bus ridership in SG (origin/destination records per hour)
df = pd.read_csv('../resources/ODIO_all_202306.csv.xz', index_col=0)  # <-- new file, train and bus ridership in SG
df.head()

# restrict to the morning peak, before office hours: 06:00-08:59
df2 = df[df['Hour_of_day'] >= 6]
df2 = df2[df2['Hour_of_day'] < 9]
len(df2)

# total ridership per subzone over the morning-peak window
df3 = df2.groupby('subz')[['Weekday_incoming', 'Weekday_outgoing',
                           'Weekend_incoming', 'Weekend_outgoing']].sum().reset_index()
df3

# look at the data -- all four ridership counts are Poisson-like (right-skewed)
cols = ['Weekday_incoming', 'Weekday_outgoing', 'Weekend_incoming', 'Weekend_outgoing']
for col in cols:
    sns.histplot(df3[col])

# how about a log-transform?
for col in cols:
    sns.histplot(np.log(df3[col]))

# keep log-transformed copies; +1 avoids log(0) -> NaN/-inf
for col in cols:
    df3[col + '_log'] = np.log(df3[col] + 1)

# subzone polygons from Master Plan 2014
subz = gpd.read_file('../resources/MP14_SUBZONE_WEB_PL.zip')  # <-- new file, Master Plan 2014
# project to SVY21 (epsg:3414), Singapore's plane coordinate system, so distances are in metres
subz = subz.to_crs('epsg:3414')
subz = subz[['SUBZONE_N', 'PLN_AREA_N', 'REGION_N', 'REGION_C', 'geometry']]
print(subz.head())
subz.plot()

len(subz), len(df3)  # the ridership data covers fewer subzones

# inner join: all subzones from the ridership dataset are successfully matched;
# subzones with no ridership record are dropped
df_merged = subz.merge(df3, left_on='SUBZONE_N', right_on='subz')
len(df_merged)
df_merged.columns

# the four raw variables plus their log-transformed versions
cols8 = ['Weekday_incoming', 'Weekday_outgoing', 'Weekend_incoming',
         'Weekend_outgoing', 'Weekday_incoming_log', 'Weekday_outgoing_log',
         'Weekend_incoming_log', 'Weekend_outgoing_log']
for col in cols8:
    df_merged.plot(col, cmap='Greys')  # just a quick choropleth per variable
## how to put them together?
# fig, axg = plt.subplots(4, 2, figsize=(10, 12))
df_merged.head()

# let's try standardizing them
def standardize(pd_series):
    """Return the z-score of a pandas Series: (x - mean(x)) / std(x)."""
    return (pd_series - pd_series.mean()) / pd_series.std()


# add a standardized copy of each variable and map it
for col in cols8:
    df_merged['z_' + col] = standardize(df_merged[col])
    df_merged.plot('z_' + col, cmap='Greys')
# distributions of the standardized vs raw variables, with and without log-transform
for col in cols:
    sns.histplot(df_merged['z_' + col])
for col in cols:
    sns.histplot(df_merged[col])
for col in cols:
    sns.histplot(df_merged['z_' + col + '_log'])
for col in cols:
    sns.histplot(df_merged[col + '_log'])

# The above is how the data looks; now let's check the spatial weights.
# Try drawing lines connecting the 'neighborhood' of each subzone.
subz.head()

# centroid of each subzone, keyed by subzone name
subz_centers = subz.geometry.centroid
subz_centers = {k: p for k, p in zip(subz['SUBZONE_N'], subz_centers)}
print(subz_centers['TAMPINES EAST'])

# queen contiguity: subzones sharing a border or a corner are neighbours
this_w = weights.Queen.from_dataframe(df_merged, ids='SUBZONE_N')
this_lines = []
for k in df_merged['SUBZONE_N']:
    ns = this_w[k]  # neighbours of subzone k (mapping: id -> weight)
    p0 = subz_centers[k]
    for n in ns:
        p1 = subz_centers[n]
        this_lines.append({'subz': k, 'neighbor': n, 'geometry': LineString([p0, p1])})
this_lines = pd.DataFrame.from_dict(this_lines)
this_lines = gpd.GeoDataFrame(this_lines, geometry=this_lines['geometry'], crs=subz.crs)

fig, ax = plt.subplots()
df_merged.plot(ax=ax)
this_lines.plot(ax=ax, ec='k')

# peek at the weights object
this_w['NORTH COAST']
len(list(this_w))
for i, k in enumerate(this_w):
    print(k)
    if i >= 5:
        break


def gen_lines(this_w, gdf=None, centers=None, crs=None):
    """Build a GeoDataFrame of centroid-to-centroid lines for a weights object.

    Parameters
    ----------
    this_w : libpysal weights object keyed by subzone name.
    gdf, centers, crs : optional overrides; they default to the notebook-level
        globals (df_merged, subz_centers, subz.crs) so existing calls keep
        working, and can be passed explicitly if this function is moved into a
        standalone script.
    """
    if gdf is None:
        gdf = df_merged
    if centers is None:
        centers = subz_centers
    if crs is None:
        crs = subz.crs
    segments = []
    for k in gdf['SUBZONE_N']:
        p0 = centers[k]
        for n in this_w[k]:
            segments.append({'subz': k, 'neighbor': n,
                             'geometry': LineString([p0, centers[n]])})
    segments = pd.DataFrame.from_dict(segments)
    return gpd.GeoDataFrame(segments, geometry=segments['geometry'], crs=crs)


# queen contiguity (1st order)
w1 = weights.Queen.from_dataframe(df_merged, ids='SUBZONE_N')
# FIX: `higher_order` was called unqualified, but its import (L9) is commented
# out -- qualify it with the `weights` namespace so this actually runs.
w2 = weights.higher_order(w1, k=2, lower_order=True)  # queen contiguity up to 2nd order
w3 = weights.KNN.from_dataframe(df_merged, ids='SUBZONE_N', k=10)  # k-nearest neighbors
w4 = weights.DistanceBand.from_dataframe(df_merged, ids='SUBZONE_N', threshold=2000)  # distance band (2 km radius)
w5 = weights.Rook.from_dataframe(df_merged, ids='SUBZONE_N')  # rook contiguity
w6 = weights.higher_order(w5, k=2, lower_order=True)  # rook contiguity up to 2nd order

lines1 = gen_lines(w1)
lines2 = gen_lines(w2)
lines3 = gen_lines(w3)
lines4 = gen_lines(w4)
lines5 = gen_lines(w5)
lines6 = gen_lines(w6)

## some colors from Catppuccin: https://github.com/catppuccin/catppuccin
LAV = '#7287fd'
MAR = '#e64553'
BLK = '#4c4f69'
GRN = '#40a02b'
GRN2 = '#a6e3a1'
WHT = '#dce0e8'
WHT2 = '#eff1f5'

# one panel per neighbourhood definition, lines drawn over the subzone map
fig, axg = plt.subplots(3, 2, figsize=(12, 12))
axs = axg.flatten()
lines = [lines5, lines6, lines1, lines2, lines3, lines4]
spatial_defs = [
    'Rook (1st level)', 'Rook (1&2 levels)',
    'Queen (1st level)', 'Queen (1&2 levels)',
    'k-nearest neighbor (k=10)', 'Distance Band (threshold=2000m)'
]
for i, ln in enumerate(lines):
    ax = axs[i]
    df_merged.plot(ax=ax, fc=LAV, alpha=.9)
    ln.plot(ax=ax, ec=BLK)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title('Neighborhood of {}'.format(spatial_defs[i]))
plt.tight_layout()

# back to Moran's I
# rebuild the candidate weights (same definitions as above)
w1 = weights.Queen.from_dataframe(df_merged, ids='SUBZONE_N')  # queen contiguity
w2 = weights.higher_order(w1, k=2, lower_order=True)  # FIX: qualify `higher_order` (import at L9 is commented out)
w3 = weights.KNN.from_dataframe(df_merged, ids='SUBZONE_N', k=10)  # k-nearest neighbors
w4 = weights.DistanceBand.from_dataframe(df_merged, ids='SUBZONE_N', threshold=2000)  # distance band
w5 = weights.Rook.from_dataframe(df_merged, ids='SUBZONE_N')  # rook contiguity
w6 = weights.higher_order(w5, k=2, lower_order=True)  # 2nd level rook contiguity

# let's work with KNN
w = w3
#w.transform = 'R' # row standardized >> can be done during the calculation of Moran's I

# spatial lag of each variable, and of its z-score
for col in cols8:
    print(col)
    df_merged['lag_' + col] = weights.lag_spatial(w, df_merged[col])
    df_merged['lag_z_' + col] = weights.lag_spatial(w, df_merged['z_' + col])

# quick look: one variable against its spatial lag
col = cols8[-1]
sns.regplot(x=col, y='lag_' + col, data=df_merged, ci=None)
# value vs spatial lag, raw counts
fig, axg = plt.subplots(2, 2, figsize=(8, 8))
axs = axg.flatten()
for i, col in enumerate(cols):
    ax = axs[i]
    sns.regplot(x=col, y='lag_' + col, data=df_merged, ci=None, ax=ax)
    #ax.axvline(0, c='k', alpha=0.5)
    #ax.axhline(0, c='k', alpha=0.5)

# value vs spatial lag, log-transformed
fig, axg = plt.subplots(2, 2, figsize=(8, 8))
axs = axg.flatten()
for i, col in enumerate(cols):
    ax = axs[i]
    sns.regplot(x=col + '_log', y='lag_' + col + '_log', data=df_merged, ci=None, ax=ax)

# z-scores: the 0-crosshairs split the plot into the four Moran quadrants
fig, axg = plt.subplots(2, 2, figsize=(8, 8))
axs = axg.flatten()
for i, col in enumerate(cols):
    ax = axs[i]
    sns.regplot(x='z_' + col, y='lag_z_' + col, data=df_merged, ci=None, ax=ax)
    ax.axvline(0, c='k', alpha=0.5)
    ax.axhline(0, c='k', alpha=0.5)
plt.tight_layout()

# z-scores of the log-transformed variables
fig, axg = plt.subplots(2, 2, figsize=(8, 8))
axs = axg.flatten()
for i, col in enumerate(cols):
    ax = axs[i]
    sns.regplot(x='z_' + col + '_log', y='lag_z_' + col + '_log', data=df_merged, ci=None, ax=ax)
    ax.axvline(0, c='k', alpha=0.5)
    ax.axhline(0, c='k', alpha=0.5)
plt.tight_layout()

df_merged.columns

# Moran's I for a single variable
mi = esda.Moran(df_merged[cols[3]], w)
mi.I, mi.p_sim, moran_scatterplot(mi, zstandard=True)

# Moran's I for all eight variables (raw + log-transformed)
calculates_mi = []
mi_list = {}
for i, col in enumerate(cols8):
    #ax.set_aspect('equal')
    mi = esda.Moran(df_merged[col], w, permutations=99)
    #print(mi.I, mi.p_sim, col)
    mi_list[col] = mi
    calculates_mi.append({'var': col, 'I': mi.I, 'p_sim': mi.p_sim,
                          'p_z_sim': mi.p_z_sim, 'p_rand': mi.p_rand,
                          'p_norm': mi.p_norm})
## p_sim: pseudo p-value against the permutation distribution
## p_z_sim: p-value from the z-standardized permutation statistic
## p_rand: against the uniform-random distribution assumption
## p_norm: against the normal/Gaussian distribution assumption
calculates_mi = pd.DataFrame(calculates_mi)
calculates_mi.round(4)

cols_with_log = ['Weekday_incoming', 'Weekday_outgoing', 'Weekend_incoming',
                 'Weekend_outgoing', 'Weekday_incoming_log', 'Weekday_outgoing_log',
                 'Weekend_incoming_log', 'Weekend_outgoing_log']
# Moran scatterplot for each variable, annotated with I and its pseudo p-value
fig, axg = plt.subplots(4, 2, figsize=(10, 10))
axs = axg.flatten()
for i, col in enumerate(cols_with_log):
    ax = axs[i]
    #ax.set_aspect('equal')
    mi = mi_list[col]
    fig, ax = moran_scatterplot(mi, ax=ax, zstandard=True)
    ax.set_title('{} (MI={:.3f}, pv={:.3f})'.format(col, mi.I, mi.p_sim))
plt.tight_layout()

# the top four plots (original values)
have lower I values;
the incoming variables are not significant,
while the outgoing variables are significant.
The bottom four plots (log-transformed)
have higher I values,
and the p-values indicate all of them are significant;
the outgoing variables' I values are higher than the incoming ones'.
These results imply that during the morning commuting hours (6am-9am), the outgoing volumes are significantly spatially clustered.
What about using other spatial weights, or KNN with a different k?