GE5230 lab — HDB resale price analysis: t-tests, ANOVA, and non-parametric alternatives (pingouin)
!pip install -U pingouin
from google.colab import drive
drive.mount('/content/drive')
import os
import numpy as np
import pandas as pd
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt
import seaborn as sns

import pingouin as pg
## https://pingouin-stats.org/build/html/index.html


#from shapely.geometry import Point
#from scipy import stats
#from scipy import special
#from tqdm import tqdm
#import mapclassify as mc
# Path to the shared data folder on the mounted Google Drive.
data_dir = './drive/MyDrive/GE5230-2510/data_base/'
os.listdir(data_dir)  # notebook display: sanity-check that the expected files are present
# Master Plan 2019 planning-area polygons.
gdf_pa = gpd.read_file(os.path.join(data_dir, 'master-plan-2019-planningarea.shp'))
# Reproject to EPSG:3414 (SVY21 / Singapore TM) so joins/plots use projected coordinates.
gdf_pa = gdf_pa.to_crs('epsg:3414')
gdf_pa.plot()
gdf_pa.head()
# Merge planning areas that share a region name into one polygon per region.
gdf_reg = gdf_pa.dissolve('REGION_N').reset_index()
gdf_reg.head()
# Keep only the columns needed downstream: geometry plus region name/code.
gdf_reg = gdf_reg[['geometry', 'REGION_N', 'REGION_C']]
gdf_reg.head()
gdf_reg.plot()
# HDB resale transactions for 2023 (xz-compressed CSV; first column is the row index).
f = os.path.join(data_dir, 'hdb_data_2023.csv.xz')
df23 = pd.read_csv(f, index_col=0)
df23.head()
# Same for 2013.
f = os.path.join(data_dir, 'hdb_data_2013.csv.xz')
df13 = pd.read_csv(f, index_col=0)
df13.head()
# Stack both years into one long table.
df = pd.concat([df13, df23])
df.head()
# Parse the WKT strings in the 'geometry' column into shapely objects;
# the CRS is declared to match gdf_pa (EPSG:3414).
gdf = gpd.GeoDataFrame(data=df,
                       geometry=df['geometry'].apply(wkt.loads), crs='epsg:3414')
gdf.plot()
# Quick overlay: region fill, transaction points in black, region outlines in red.
fig, ax = plt.subplots()
gdf_reg.plot(fc='xkcd:off white', ax=ax)
gdf.plot(color='k', ax=ax)
gdf_reg.plot(fc='none', ec='red', ax=ax)
df['town'].unique()  # notebook display: which HDB towns appear in the data
# Attach region attributes to each transaction point via spatial join.
gdf2 = gdf.sjoin(gdf_reg)
gdf2.head()
gdf2.plot('REGION_N')
# Split the joined data back into the two survey years.
gdf13 = gdf2[gdf2['resale_year']==2013]
gdf23 = gdf2[gdf2['resale_year']==2023]
# Compare counts: the spatial join can drop points that fall outside every region polygon.
len(gdf13), len(gdf23), len(df13), len(df23)
# Use one shared colour scale for both years so the two maps are directly comparable.
gdf13.plot('resale_price', 'YlOrRd', vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max())
gdf23.plot('resale_price', 'YlOrRd', vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max())
# Sort ascending by price so the most expensive points are drawn last (on top) in later maps.
gdf13 = gdf13.sort_values(by='resale_price')
gdf23 = gdf23.sort_values(by='resale_price')
# Two-panel (stacked 2x1) map: 2013 vs 2023 resale prices on a shared colour scale.
fig, axs = plt.subplots(2, 1, figsize=(12, 12))
for ax in axs:
    gdf_reg.plot(fc='xkcd:off white', ax=ax)  # base-map fill
    ax.set_xticks([])
    ax.set_yticks([])

# vmin/vmax come from the combined data so both panels share one colour range.
gdf13.plot('resale_price', 'Blues',
            vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max(), ax=axs[0],
            legend=True,
            )
gdf23.plot('resale_price', 'Blues',
            vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max(), ax=axs[1],
            legend=True,
            )
titles = [2013, 2023]
for i, ax in enumerate(axs):
    gdf_reg.plot(fc='none', ec='red', ax=ax)  # region outlines drawn on top of the points
    ax.set_title(titles[i])
plt.tight_layout()
fig.savefig('HDB_price_points_13_23.png', dpi=200, bbox_inches='tight')
# Same 2013-vs-2023 comparison as above, but side-by-side (1x2) instead of stacked.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
for ax in axs:
    gdf_reg.plot(fc='xkcd:off white', ax=ax)  # base-map fill
    ax.set_xticks([])
    ax.set_yticks([])

# Shared colour limits across both panels so prices are directly comparable.
vmin = gdf2['resale_price'].min()
vmax = gdf2['resale_price'].max()
for ax, year_gdf, title in zip(axs, [gdf13, gdf23], [2013, 2023]):
    year_gdf.plot('resale_price', 'Blues', vmin=vmin, vmax=vmax, ax=ax, legend=True)
    gdf_reg.plot(fc='none', ec='red', ax=ax)  # region outlines drawn on top of the points
    ax.set_title(title)
plt.tight_layout()
# BUG FIX: the stacked figure above already saved to 'HDB_price_points_13_23.png';
# reusing that exact name here silently overwrote it. Save under a distinct name.
fig.savefig('HDB_price_points_13_23_wide.png', dpi=200, bbox_inches='tight')

Running a two-sample t-test (east vs west region, 2023 transactions)

gdf23.head()
# Price distributions per region, 2023: overlapping kernel density estimates.
sns.kdeplot(x='resale_price', hue='REGION_N', data=gdf23)
#sns.kdeplot(x='resale_price', hue='REGION_N', data=gdf2, hue_order=['EAST REGION', 'WEST REGION'])
def _region_subset(code):
    """Return the 2023 transactions located in the region with REGION_C == code."""
    return gdf23[gdf23['REGION_C'] == code]

df_east = _region_subset('ER')
df_west = _region_subset('WR')
df_central = _region_subset('CR')
df_north = _region_subset('NR')
df_northeast = _region_subset('NER')
# sanity check (disabled): concatenating east+west should preserve row counts
#tmp = pd.concat([df_east, df_west])
#len(tmp), len(df_east), len(df_west)
# Two-sample t-test (Welch correction applied automatically when variances differ),
# H1: mean(east) != mean(west).
pg.ttest(df_east['resale_price'], df_west['resale_price'],
         paired=False, alternative='two-sided',
         correction='auto', confidence=0.95)
# One-sided: H1 is mean(east) > mean(west).
pg.ttest(df_east['resale_price'], df_west['resale_price'],
         paired=False, alternative='greater',
         correction='auto', confidence=0.95)
# One-sided the other way: H1 is mean(east) < mean(west).
pg.ttest(df_east['resale_price'], df_west['resale_price'],
         paired=False, alternative='less',
         correction='auto', confidence=0.95)

Conclusion: east-region transactions are significantly more expensive than west-region ones (the 'greater' alternative is supported).

# North vs west, H1: mean(north) < mean(west).
# r=0.707 is the Cauchy prior scale used for the Bayes-factor column
# (this is pingouin's documented default — stating it explicitly here).
pg.ttest(df_north['resale_price'], df_west['resale_price'],
         paired=False, alternative='less',
         correction='auto', r=0.707, confidence=0.95)

Running a one-way ANOVA across regions, with post-hoc pairwise comparisons

# Check equality of variances across regions (assumption of the classic ANOVA),
# using two different tests.
pg.homoscedasticity(gdf23, dv='resale_price', group='REGION_C', method='levene', alpha=0.05)
pg.homoscedasticity(gdf23, dv='resale_price', group='REGION_C', method='bartlett', alpha=0.05)

Both Levene's and Bartlett's tests indicate the group variances are unequal, so Welch's ANOVA is the appropriate choice below.

# Welch's ANOVA: does not assume equal variances, so it is the valid test here.
pg.welch_anova(data=gdf23, dv='resale_price', between='REGION_C')
pg.anova(data=gdf23, dv='resale_price', between='REGION_C', ss_type=2, detailed=False, effsize='np2')  # not accurate: classic ANOVA assumes equal variances, rejected above
# Games-Howell post-hoc: the unequal-variance-safe pairwise comparison.
pg.pairwise_gameshowell(data=gdf23, dv='resale_price', between='REGION_C', effsize='hedges')
pg.pairwise_tukey(data=gdf23, dv='resale_price', between='REGION_C', effsize='hedges')  # not accurate: Tukey HSD also assumes equal variances

Running a paired t-test (2013 vs 2023 mean price per planning area)

# Join transactions to the finer planning-area polygons (not just the five regions).
gdf3 = gdf.sjoin(gdf_pa)
gdf3.plot('PLN_AREA_N', cmap='tab20')
gdf3
# Mean resale price per (year, planning area) — long format.
pa_df_long = gdf3.groupby(['resale_year', 'PLN_AREA_N'])[['resale_price']].mean().reset_index()
pa_df_long
# Pivot to wide: one row per planning area, one column per year (2013, 2023).
pa_df = pa_df_long.pivot(index='PLN_AREA_N', columns='resale_year', values='resale_price')
pa_df
len(pa_df)
# Paired t-test on per-area means, H1: mean(2013) < mean(2023).
# NOTE(review): areas with transactions in only one year produce NaN pairs after the
# pivot — presumably dropped by pingouin before testing; verify.
pg.ttest(pa_df[2013], pa_df[2023], paired=True, alternative='less',
         correction='auto', r=0.707, confidence=0.95)

Running a repeated-measures ANOVA (within-subject factor: flat type; subjects: planning areas)

gdf3.head()
# Mean price per (year, flat type, planning area) — long format.
ft_df_long = gdf3.groupby(['resale_year', 'flat_type', 'PLN_AREA_N'])[['resale_price']].mean().reset_index()
ft_df_long
# Keep only 2023.
ft_df_long23 = ft_df_long[ft_df_long['resale_year']==2023]
ft_df_long23
# Restrict to the common flat types so planning areas can have every level of the factor.
keep_type = ['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM']
# IDIOM FIX: vectorized Series.isin replaces the Python-level list-comprehension mask
# (same boolean mask, computed in C and clearer to read).
ft_df_long23 = ft_df_long23[ft_df_long23['flat_type'].isin(keep_type)]
ft_df_long23
# Wide format: rows = planning areas (subjects), columns = flat types (within factor).
ft_df_23 = ft_df_long23.pivot(index='PLN_AREA_N', columns='flat_type', values='resale_price')
ft_df_23
# Repeated-measures ANOVA via the long-table API ...
pg.rm_anova(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', correction='auto', detailed=False, effsize='ng2')
# ... and the equivalent wide-table API (should give the same result).
pg.rm_anova(data=ft_df_23, correction='auto', detailed=False, effsize='ng2')

Two ways to run a repeated-measures ANOVA: a long-format table or a wide-format table.

# Parametric post-hoc: paired t-tests between flat types, two-sided first ...
pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=True, marginal=True, alpha=0.05, alternative='two-sided', )
# ... then both one-sided alternatives to establish the direction of each difference.
pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=True, marginal=True, alpha=0.05, alternative='greater', )
pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=True, marginal=True, alpha=0.05, alternative='less', )

In each significant pair, the smaller flat type A is cheaper than the larger flat type B.

Non-parametric version: the Friedman test, the standard alternative to the repeated-measures ANOVA.

# Friedman test via the wide-table form ...
pg.friedman(ft_df_23)
# ... and the equivalent long-table form.
pg.friedman(ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N')
# Non-parametric post-hoc pairwise comparisons (parametric=False), two-sided then one-sided.
pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=False, marginal=True, alpha=0.05, alternative='two-sided', )
pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=False, marginal=True, alpha=0.05, alternative='less', )
pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=False, marginal=True, alpha=0.05, alternative='greater', )

Kruskal-Wallis test: the non-parametric alternative to the one-way ANOVA.

gdf23.head()
# Kruskal-Wallis: rank-based one-way test across regions; no variance assumption.
pg.kruskal(data=gdf23, dv='resale_price', between='REGION_C', detailed=False)
# Non-parametric pairwise comparisons between regions.
# NOTE(review): 'subject' is normally only needed for within-subject designs —
# presumably ignored for a pure between-group test; verify against pingouin docs.
pg.pairwise_tests(data=gdf23, dv='resale_price', between='REGION_C', subject='PLN_AREA_N',
                  parametric=False, marginal=True, alpha=0.05, alternative='two-sided', )

Only one pairwise regional comparison is not statistically significant.