!pip install -U pingouinfrom google.colab import drive
drive.mount('/content/drive')import os
import numpy as np
import pandas as pd
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg
## https://pingouin-stats.org/build/html/index.html
#from shapely.geometry import Point
#from scipy import stats
#from scipy import special
#from tqdm import tqdm
#import mapclassify as mcdata_dir = './drive/MyDrive/GE5230-2510/data_base/'
os.listdir(data_dir)gdf_pa = gpd.read_file(os.path.join(data_dir, 'master-plan-2019-planningarea.shp'))
gdf_pa = gdf_pa.to_crs('epsg:3414')
gdf_pa.plot()gdf_pa.head()gdf_reg = gdf_pa.dissolve('REGION_N').reset_index()
gdf_reg.head()gdf_reg = gdf_reg[['geometry', 'REGION_N', 'REGION_C']]
gdf_reg.head()gdf_reg.plot()f = os.path.join(data_dir, 'hdb_data_2023.csv.xz')
df23 = pd.read_csv(f, index_col=0)
df23.head()f = os.path.join(data_dir, 'hdb_data_2013.csv.xz')
df13 = pd.read_csv(f, index_col=0)
df13.head()df = pd.concat([df13, df23])df.head()gdf = gpd.GeoDataFrame(data=df,
geometry=df['geometry'].apply(wkt.loads), crs='epsg:3414')
gdf.plot()fig, ax = plt.subplots()
gdf_reg.plot(fc='xkcd:off white', ax=ax)
gdf.plot(color='k', ax=ax)
gdf_reg.plot(fc='none', ec='red', ax=ax)df['town'].unique()gdf2 = gdf.sjoin(gdf_reg)
gdf2.head()gdf2.plot('REGION_N')gdf13 = gdf2[gdf2['resale_year']==2013]
gdf23 = gdf2[gdf2['resale_year']==2023]
len(gdf13), len(gdf23), len(df13), len(df23)gdf13.plot('resale_price', 'YlOrRd', vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max())gdf23.plot('resale_price', 'YlOrRd', vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max())gdf13 = gdf13.sort_values(by='resale_price')
gdf23 = gdf23.sort_values(by='resale_price')fig, axs = plt.subplots(2, 1, figsize=(12, 12))
for ax in axs:
gdf_reg.plot(fc='xkcd:off white', ax=ax)
ax.set_xticks([])
ax.set_yticks([])
gdf13.plot('resale_price', 'Blues',
vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max(), ax=axs[0],
legend=True,
)
gdf23.plot('resale_price', 'Blues',
vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max(), ax=axs[1],
legend=True,
)
titles = [2013, 2023]
for i, ax in enumerate(axs):
gdf_reg.plot(fc='none', ec='red', ax=ax)
ax.set_title(titles[i])
plt.tight_layout()
fig.savefig('HDB_price_points_13_23.png', dpi=200, bbox_inches='tight')fig, axs = plt.subplots(1, 2, figsize=(18, 5))
for ax in axs:
gdf_reg.plot(fc='xkcd:off white', ax=ax)
ax.set_xticks([])
ax.set_yticks([])
gdf13.plot('resale_price', 'Blues',
vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max(), ax=axs[0],
legend=True,
)
gdf23.plot('resale_price', 'Blues',
vmin=gdf2['resale_price'].min(), vmax=gdf2['resale_price'].max(), ax=axs[1],
legend=True,
)
titles = [2013, 2023]
for i, ax in enumerate(axs):
gdf_reg.plot(fc='none', ec='red', ax=ax)
ax.set_title(titles[i])
plt.tight_layout()
fig.savefig('HDB_price_points_13_23.png', dpi=200, bbox_inches='tight')for running t-test¶
gdf23.head()sns.kdeplot(x='resale_price', hue='REGION_N', data=gdf23)
#sns.kdeplot(x='resale_price', hue='REGION_N', data=gdf2, hue_order=['EAST REGION', 'WEST REGION'])df_east = gdf23[gdf23['REGION_C']=='ER']
df_west = gdf23[gdf23['REGION_C']=='WR']
df_central = gdf23[gdf23['REGION_C']=='CR']
df_north = gdf23[gdf23['REGION_C']=='NR']
df_northeast = gdf23[gdf23['REGION_C']=='NER']
#tmp = pd.concat([df_east, df_west])
#len(tmp), len(df_east), len(df_west)pg.ttest(df_east['resale_price'], df_west['resale_price'],
paired=False, alternative='two-sided',
correction='auto', confidence=0.95)pg.ttest(df_east['resale_price'], df_west['resale_price'],
paired=False, alternative='greater',
correction='auto', confidence=0.95)pg.ttest(df_east['resale_price'], df_west['resale_price'],
paired=False, alternative='less',
correction='auto', confidence=0.95)east’s transactions are more expensive than west
pg.ttest(df_north['resale_price'], df_west['resale_price'],
paired=False, alternative='less',
correction='auto', r=0.707, confidence=0.95)for running ANOVA + post-hoc¶
pg.homoscedasticity(gdf23, dv='resale_price', group='REGION_C', method='levene', alpha=0.05)pg.homoscedasticity(gdf23, dv='resale_price', group='REGION_C', method='bartlett', alpha=0.05)both methods indicate the variances are not equal
pg.welch_anova(data=gdf23, dv='resale_price', between='REGION_C')pg.anova(data=gdf23, dv='resale_price', between='REGION_C', ss_type=2, detailed=False, effsize='np2') # not accuratepg.pairwise_gameshowell(data=gdf23, dv='resale_price', between='REGION_C', effsize='hedges')pg.pairwise_tukey(data=gdf23, dv='resale_price', between='REGION_C', effsize='hedges') # not accuratefor paired t test¶
gdf3 = gdf.sjoin(gdf_pa)
gdf3.plot('PLN_AREA_N', cmap='tab20')gdf3pa_df_long = gdf3.groupby(['resale_year', 'PLN_AREA_N'])[['resale_price']].mean().reset_index()
pa_df_longpa_df = pa_df_long.pivot(index='PLN_AREA_N', columns='resale_year', values='resale_price')
pa_dflen(pa_df)pg.ttest(pa_df[2013], pa_df[2023], paired=True, alternative='less',
correction='auto', r=0.707, confidence=0.95)for running repeated measure ANOVA (within group)¶
gdf3.head()ft_df_long = gdf3.groupby(['resale_year', 'flat_type', 'PLN_AREA_N'])[['resale_price']].mean().reset_index()
ft_df_longft_df_long23 = ft_df_long[ft_df_long['resale_year']==2023]
ft_df_long23keep_type = ['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM']
ft_df_long23 = ft_df_long23[[ f in keep_type for f in ft_df_long23['flat_type']]]
ft_df_long23ft_df_23 = ft_df_long23.pivot(index='PLN_AREA_N', columns='flat_type', values='resale_price')
ft_df_23pg.rm_anova(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', correction='auto', detailed=False, effsize='ng2')pg.rm_anova(data=ft_df_23, correction='auto', detailed=False, effsize='ng2')2 ways to run repeated measure ANOVA: long table and wide table
pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=True, marginal=True, alpha=0.05, alternative='two-sided', )pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=True, marginal=True, alpha=0.05, alternative='greater', )pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=True, marginal=True, alpha=0.05, alternative='less', )A is cheapter than B
now the Non-parametric version (Friedman)¶
as alternative to rm-anova
pg.friedman(ft_df_23)pg.friedman(ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N')pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=False, marginal=True, alpha=0.05, alternative='two-sided', )pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=False, marginal=True, alpha=0.05, alternative='less', )pg.pairwise_tests(data=ft_df_long23, dv='resale_price', within='flat_type', subject='PLN_AREA_N', parametric=False, marginal=True, alpha=0.05, alternative='greater', )Kruskall-Wallis test¶
alternative to ANOVA
gdf23.head()pg.kruskal(data=gdf23, dv='resale_price', between='REGION_C', detailed=False)pg.pairwise_tests(data=gdf23, dv='resale_price', between='REGION_C', subject='PLN_AREA_N',
parametric=False, marginal=True, alpha=0.05, alternative='two-sided', )only one not significant