# import os
import pandas as pd
import geopandas as gpd
# for plotting the spatial relationships
from shapely.geometry import LineString
## the pysal weights and Moran's I calculation
# from pysal.lib import weights
from libpysal import weights
#from libpysal.weights import higher_order
#from pysal.explore import esda # esda: Exploratory Spatial Data Analysis
import esda
from splot.esda import moran_scatterplot
# Plotting functions
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm

# quick sanity check that the pysal weights API is available
weights.Queen, weights.higher_order

#!pip install pysal

# train and bus ridership in SG (origin/destination records per hour)
df = pd.read_csv('../resources/ODIO_all_202306.csv.xz', index_col=0)  # <-- new file, train and bus ridership in SG
df.head()

# restrict to the morning peak, before office hours: 06:00-08:59
df2 = df[df['Hour_of_day'] >= 6]
df2 = df2[df2['Hour_of_day'] < 9]
len(df2)

# total ridership per subzone over the morning-peak window
df3 = df2.groupby('subz')[['Weekday_incoming', 'Weekday_outgoing',
                           'Weekend_incoming', 'Weekend_outgoing']].sum().reset_index()
df3

# look at the data -- all four ridership counts are Poisson-like (right-skewed)
cols = ['Weekday_incoming', 'Weekday_outgoing', 'Weekend_incoming', 'Weekend_outgoing']
for col in cols:
    sns.histplot(df3[col])

# how about a log-transform?
for col in cols:
    sns.histplot(np.log(df3[col]))

# keep log-transformed copies; +1 avoids log(0) -> NaN/-inf
for col in cols:
    df3[col + '_log'] = np.log(df3[col] + 1)

# subzone polygons from Master Plan 2014
subz = gpd.read_file('../resources/MP14_SUBZONE_WEB_PL.zip')  # <-- new file, Master Plan 2014
# project to SVY21 (epsg:3414), Singapore's plane coordinate system, so distances are in metres
subz = subz.to_crs('epsg:3414')
subz = subz[['SUBZONE_N', 'PLN_AREA_N', 'REGION_N', 'REGION_C', 'geometry']]
print(subz.head())
subz.plot()

len(subz), len(df3)  # the ridership data covers fewer subzones

# inner join: all subzones from the ridership dataset are successfully matched;
# subzones with no ridership record are dropped
df_merged = subz.merge(df3, left_on='SUBZONE_N', right_on='subz')
len(df_merged)
df_merged.columns

# the four raw variables plus their log-transformed versions
cols8 = ['Weekday_incoming', 'Weekday_outgoing', 'Weekend_incoming',
         'Weekend_outgoing', 'Weekday_incoming_log', 'Weekday_outgoing_log',
         'Weekend_incoming_log', 'Weekend_outgoing_log']
for col in cols8:
    df_merged.plot(col, cmap='Greys')  # just a quick choropleth per variable
## how to put them together?
# fig, axg = plt.subplots(4, 2, figsize=(10, 12))
df_merged.head()

# let's try standardizing them
def standardize(pd_series):
    """Return the z-score of a pandas Series: (x - mean(x)) / std(x)."""
    return (pd_series - pd_series.mean()) / pd_series.std()


# add a standardized copy of each variable and map it
for col in cols8:
    df_merged['z_' + col] = standardize(df_merged[col])
    df_merged.plot('z_' + col, cmap='Greys')
# distributions of the standardized vs raw variables, with and without log-transform
for col in cols:
    sns.histplot(df_merged['z_' + col])
for col in cols:
    sns.histplot(df_merged[col])
for col in cols:
    sns.histplot(df_merged['z_' + col + '_log'])
for col in cols:
    sns.histplot(df_merged[col + '_log'])

# The above is how the data looks; now let's check the spatial weights.
# Try drawing lines connecting the 'neighborhood' of each subzone.
subz.head()

# centroid of each subzone, keyed by subzone name
subz_centers = subz.geometry.centroid
subz_centers = {k: p for k, p in zip(subz['SUBZONE_N'], subz_centers)}
print(subz_centers['TAMPINES EAST'])

# queen contiguity: subzones sharing a border or a corner are neighbours
this_w = weights.Queen.from_dataframe(df_merged, ids='SUBZONE_N')
this_lines = []
for k in df_merged['SUBZONE_N']:
    ns = this_w[k]  # neighbours of subzone k (mapping: id -> weight)
    p0 = subz_centers[k]
    for n in ns:
        p1 = subz_centers[n]
        this_lines.append({'subz': k, 'neighbor': n, 'geometry': LineString([p0, p1])})
this_lines = pd.DataFrame.from_dict(this_lines)
this_lines = gpd.GeoDataFrame(this_lines, geometry=this_lines['geometry'], crs=subz.crs)

fig, ax = plt.subplots()
df_merged.plot(ax=ax)
this_lines.plot(ax=ax, ec='k')

# peek at the weights object
this_w['NORTH COAST']
len(list(this_w))
for i, k in enumerate(this_w):
    print(k)
    if i >= 5:
        break


def gen_lines(this_w, gdf=None, centers=None, crs=None):
    """Build a GeoDataFrame of centroid-to-centroid lines for a weights object.

    Parameters
    ----------
    this_w : libpysal weights object keyed by subzone name.
    gdf, centers, crs : optional overrides; they default to the notebook-level
        globals (df_merged, subz_centers, subz.crs) so existing calls keep
        working, and can be passed explicitly if this function is moved into a
        standalone script.
    """
    if gdf is None:
        gdf = df_merged
    if centers is None:
        centers = subz_centers
    if crs is None:
        crs = subz.crs
    segments = []
    for k in gdf['SUBZONE_N']:
        p0 = centers[k]
        for n in this_w[k]:
            segments.append({'subz': k, 'neighbor': n,
                             'geometry': LineString([p0, centers[n]])})
    segments = pd.DataFrame.from_dict(segments)
    return gpd.GeoDataFrame(segments, geometry=segments['geometry'], crs=crs)


# queen contiguity (1st order)
w1 = weights.Queen.from_dataframe(df_merged, ids='SUBZONE_N')
# FIX: `higher_order` was called unqualified, but its import (L9) is commented
# out -- qualify it with the `weights` namespace so this actually runs.
w2 = weights.higher_order(w1, k=2, lower_order=True)  # queen contiguity up to 2nd order
w3 = weights.KNN.from_dataframe(df_merged, ids='SUBZONE_N', k=10)  # k-nearest neighbors
w4 = weights.DistanceBand.from_dataframe(df_merged, ids='SUBZONE_N', threshold=2000)  # distance band (2 km radius)
w5 = weights.Rook.from_dataframe(df_merged, ids='SUBZONE_N')  # rook contiguity
w6 = weights.higher_order(w5, k=2, lower_order=True)  # rook contiguity up to 2nd order

lines1 = gen_lines(w1)
lines2 = gen_lines(w2)
lines3 = gen_lines(w3)
lines4 = gen_lines(w4)
lines5 = gen_lines(w5)
lines6 = gen_lines(w6)

## some colors from Catppuccin: https://github.com/catppuccin/catppuccin
LAV = '#7287fd'
MAR = '#e64553'
BLK = '#4c4f69'
GRN = '#40a02b'
GRN2 = '#a6e3a1'
WHT = '#dce0e8'
WHT2 = '#eff1f5'

# one panel per neighbourhood definition, lines drawn over the subzone map
fig, axg = plt.subplots(3, 2, figsize=(12, 12))
axs = axg.flatten()
lines = [lines5, lines6, lines1, lines2, lines3, lines4]
spatial_defs = [
    'Rook (1st level)', 'Rook (1&2 levels)',
    'Queen (1st level)', 'Queen (1&2 levels)',
    'k-nearest neighbor (k=10)', 'Distance Band (threshold=2000m)'
]
for i, ln in enumerate(lines):
    ax = axs[i]
    df_merged.plot(ax=ax, fc=LAV, alpha=.9)
    ln.plot(ax=ax, ec=BLK)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title('Neighborhood of {}'.format(spatial_defs[i]))
plt.tight_layout()

# back to Moran's I
# rebuild the candidate weights (same definitions as above)
w1 = weights.Queen.from_dataframe(df_merged, ids='SUBZONE_N')  # queen contiguity
w2 = weights.higher_order(w1, k=2, lower_order=True)  # FIX: qualify `higher_order` (import at L9 is commented out)
w3 = weights.KNN.from_dataframe(df_merged, ids='SUBZONE_N', k=10)  # k-nearest neighbors
w4 = weights.DistanceBand.from_dataframe(df_merged, ids='SUBZONE_N', threshold=2000)  # distance band
w5 = weights.Rook.from_dataframe(df_merged, ids='SUBZONE_N')  # rook contiguity
w6 = weights.higher_order(w5, k=2, lower_order=True)  # 2nd level rook contiguity

# let's work with KNN
w = w3
#w.transform = 'R' # row standardized >> can be done during the calculation of Moran's I

# spatial lag of each variable, and of its z-score
for col in cols8:
    print(col)
    df_merged['lag_' + col] = weights.lag_spatial(w, df_merged[col])
    df_merged['lag_z_' + col] = weights.lag_spatial(w, df_merged['z_' + col])

# quick look: one variable against its spatial lag
col = cols8[-1]
sns.regplot(x=col, y='lag_' + col, data=df_merged, ci=None)
# value vs spatial lag, raw counts
fig, axg = plt.subplots(2, 2, figsize=(8, 8))
axs = axg.flatten()
for i, col in enumerate(cols):
    ax = axs[i]
    sns.regplot(x=col, y='lag_' + col, data=df_merged, ci=None, ax=ax)
    #ax.axvline(0, c='k', alpha=0.5)
    #ax.axhline(0, c='k', alpha=0.5)

# value vs spatial lag, log-transformed
fig, axg = plt.subplots(2, 2, figsize=(8, 8))
axs = axg.flatten()
for i, col in enumerate(cols):
    ax = axs[i]
    sns.regplot(x=col + '_log', y='lag_' + col + '_log', data=df_merged, ci=None, ax=ax)

# z-scores: the 0-crosshairs split the plot into the four Moran quadrants
fig, axg = plt.subplots(2, 2, figsize=(8, 8))
axs = axg.flatten()
for i, col in enumerate(cols):
    ax = axs[i]
    sns.regplot(x='z_' + col, y='lag_z_' + col, data=df_merged, ci=None, ax=ax)
    ax.axvline(0, c='k', alpha=0.5)
    ax.axhline(0, c='k', alpha=0.5)
plt.tight_layout()

# z-scores of the log-transformed variables
fig, axg = plt.subplots(2, 2, figsize=(8, 8))
axs = axg.flatten()
for i, col in enumerate(cols):
    ax = axs[i]
    sns.regplot(x='z_' + col + '_log', y='lag_z_' + col + '_log', data=df_merged, ci=None, ax=ax)
    ax.axvline(0, c='k', alpha=0.5)
    ax.axhline(0, c='k', alpha=0.5)
plt.tight_layout()

df_merged.columns

# Moran's I for a single variable
mi = esda.Moran(df_merged[cols[3]], w)
mi.I, mi.p_sim, moran_scatterplot(mi, zstandard=True)

# Moran's I for all eight variables (raw + log-transformed)
calculates_mi = []
mi_list = {}
for i, col in enumerate(cols8):
    #ax.set_aspect('equal')
    mi = esda.Moran(df_merged[col], w, permutations=99)
    #print(mi.I, mi.p_sim, col)
    mi_list[col] = mi
    calculates_mi.append({'var': col, 'I': mi.I, 'p_sim': mi.p_sim,
                          'p_z_sim': mi.p_z_sim, 'p_rand': mi.p_rand,
                          'p_norm': mi.p_norm})
## p_sim: pseudo p-value against the permutation distribution
## p_z_sim: p-value from the z-standardized permutation statistic
## p_rand: against the uniform-random distribution assumption
## p_norm: against the normal/Gaussian distribution assumption
calculates_mi = pd.DataFrame(calculates_mi)
calculates_mi.round(4)

cols_with_log = ['Weekday_incoming', 'Weekday_outgoing', 'Weekend_incoming',
                 'Weekend_outgoing', 'Weekday_incoming_log', 'Weekday_outgoing_log',
                 'Weekend_incoming_log', 'Weekend_outgoing_log']
# Moran scatterplot for each variable, annotated with I and its pseudo p-value
fig, axg = plt.subplots(4, 2, figsize=(10, 10))
axs = axg.flatten()
for i, col in enumerate(cols_with_log):
    ax = axs[i]
    #ax.set_aspect('equal')
    mi = mi_list[col]
    fig, ax = moran_scatterplot(mi, ax=ax, zstandard=True)
    ax.set_title('{} (MI={:.3f}, pv={:.3f})'.format(col, mi.I, mi.p_sim))
plt.tight_layout()

# the top four plots (original values)
have lower I values;
the incoming variables are not significant,
while the outgoing variables are significant.
The bottom four plots (log-transformed)
have higher I values,
and the p-values indicate all of them are significant;
the outgoing variables' I values are higher than the incoming ones'.
These results imply that during the morning commuting hours (6am-9am), the outgoing volumes are significantly spatially clustered.
What about using other spatial weights, or KNN with a different k?