Skip to article frontmatterSkip to article content
import os
import random  # for generating random point
import numpy as np  # for numerical values manipulation (calculation etc.)
import pandas as pd  # for table/spreadsheets manipulation
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd  # for geospatial data / attribute table & geometry manipulation 
from shapely import wkt  # for geometry data manipulation
import matplotlib.pyplot as plt  # the most basic and important for making plots/figures in Python 
import seaborn as sns  # convenient tools for making statistical plots

import pingouin as pg  # for running hypothesis testing (T-test, ANOVA)
## https://pingouin-stats.org/build/html/index.html  # go to heir documentation website to check the syntax and roadmap


#from shapely.geometry import Point  # for manipulating point data
#from scipy import stats  # for calculating statistical index
#from scipy import special  # 
#from tqdm import tqdm  # for the progress bar
#import mapclassify as mc  # for generating categorisation for numerical data
os.listdir('../resources/')

load subzone file

load subzone shapefile

subz = gpd.read_file('../resources/master-plan-2019-subzone_landuse_area.zip')
subz.head()  # get the first 5 rows
subz.tail(2)  # get the last 2 rows
subz.plot()  # show the spatial data
subz.crs  # this is the EPSG:3414 crs
subz.columns  # show columns, seems like it has too many columns/info
## remove some non-needed columns, and keep only some important one
keep_cols = [
    'SUBZONE_N', 'SUBZONE_C', 
    'PLN_AREA_N', 'PLN_AREA_C', 
    'REGION_N', 'REGION_C', 
    'geometry']

subz = subz[keep_cols].copy()  # overriding the object by assigning the same name, this can be risky but here is ok
subz.head()

load HDB data

hdb = pd.read_csv('../resources/hdb_data_2023.csv.xz', index_col=0)  # first (0) column contain 'index' of the rows
hdb.head()  # show the firs 5 rows (by default, is 5 rows)
hdb.tail(10)  # show the last 10 rows
hdb.columns  # to check what columns is included in the csv/dataframe
hdb['geometry'].apply(wkt.loads)  ## this is what this .apply() do
g_hdb = gpd.GeoDataFrame(data=hdb,
                         geometry=hdb['geometry'].apply(wkt.loads), 
                         crs=subz.crs)  # use the same crs as the subzone, make sure they are same before setting it
g_hdb.plot()
## X.sjoin(Y) is the spatial join function, joining the Y to X
## in this case, is to join the polygon info to the point
g_hdb.sjoin(subz)  # testing how it looks, without keeping the test result as a new object
g_hdb2 = g_hdb.sjoin(subz)  # save the  spatial join result as a new geopandas object
# assign it to a new name, which is safer (not overriding)
g_hdb2.head()

try individual transaction analysis

check the resale prices between regions

sns.histplot(g_hdb2['resale_price'])
sns.boxplot(x='REGION_C', y='resale_price', data=g_hdb2)
sns.violinplot(x='REGION_C', y='resale_price', data=g_hdb2)
sns.violinplot(x='floor_area_sqm', y='resale_price', data=g_hdb2)
sns.boxplot(x='flat_type', y='resale_price', data=g_hdb2)
sns.boxplot(x='flat_type', y='resale_price', hue='REGION_C', data=g_hdb2)

let’s rearange things so that it is more intuitive

sns.boxplot(x='flat_type', y='resale_price', hue='REGION_C', data=g_hdb2,
            order=['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM'], 
            hue_order=['NR', 'WR', 'ER', 'NER', 'CR'],
           )
sns.color_palette('muted')
sns.boxplot(x='flat_type', y='resale_price', hue='REGION_C', data=g_hdb2,
            order=['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM'], 
            hue_order=['NR', 'WR', 'ER', 'NER', 'CR'],
            palette=sns.color_palette('bright')[:5]
           )

an interesting point here:

  • CR is lower than other regions for 2 room, about the same for 3 room, and higher than other zones for 4 and 5 rooms

  • for 2/3 rooms, the outliers are visibly higher

is it because of the numbers are less in CR?, let’s check

g_hdb2.groupby('REGION_C')[['resale_price']].count()
g_hdb2.groupby(['flat_type', 'REGION_C'])[['resale_price']].count()

in terms of # transaction, CR doesn’t seem to be different from others note that the word ‘seem’<--means visually speaking

remark 1

so, different flat type may have different price pattern between regions

traditionally we may want to use 2-ways anova, here let’s use what we cover in the class: one-way anova

run one-way ANOVA separately for different flat type, between REGION

g_hdb2['flat_type']=='1 ROOM', len(g_hdb2)
g_hdb2[g_hdb2['flat_type']=='1 ROOM']
ftypes = ['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM']

## declare the empty list for storing results
a_res_all = []  # for storing anova result
ph_res_all = []  # for storing post-hoc result
for ft in ftypes:
    tmp = g_hdb2[g_hdb2['flat_type']==ft]
    #print(ft, len(tmp))
    # run anova
    a_res = pg.anova(data=tmp, dv='resale_price', between='REGION_C',)
    a_res['flat_type'] = ft
    #print(a_res)
    a_res_all.append(a_res) # store anova result
    # run post-hoc 
    ph_res = pg.pairwise_tukey(data=tmp, dv='resale_price', between='REGION_C').round(3)
    #ph_res['p-tukey'] = ph_res['p-tukey'].round(4)
    ph_res['mean(A)'] = ph_res['mean(A)'].round(1)
    ph_res['mean(B)'] = ph_res['mean(B)'].round(1)
    ph_res['diff'] = ph_res['diff'].round(1)
    ph_res['flat_type'] = ft
    #print(ph_res)
    ph_res_all.append(ph_res)  # store post-hoc result
    
a_res_all = pd.concat(a_res_all)  # combine the dataframes in the list into one dataframe
#ph_res_all = pd.concat(ph_res_all)
a_res_all#, ph_res_all
for ph_res in ph_res_all:
    tmp = ph_res[ph_res['p-tukey']<=.05]  # filter to get significant row only
    print(tmp)

summary

  • for 2 room, CR is significantly lower than all others; NER > WR

  • for 3 room, CR is significantly higher than all others (this is not visible); NER > (ER, NR, WR)

  • for 4 and 5 rooms:

    • CR > all

    • ER > most of them

    • NER > NR & WR

    • NR < WR

  • ER and NER, some times one is significantly higher than another, which does not always stand for other flat type

    • this means that, for different flat types, the relationship (> or <) may change between regions

try to see how the different flat type in each region

regs = ['CR', 'ER', 'NR', 'NER', 'WR']

a_res_all2 = []
for reg in regs:
    tmp = g_hdb2[g_hdb2['REGION_C']==reg]
    #print(tmp.head())
    a_res = pg.anova(data=tmp, dv='resale_price', between='flat_type',)
    a_res['REGION_C'] = reg
    a_res_all2.append(a_res)
    print(a_res)
    break

# note the ddof1: the flat type contains 1 room, executive, and multi-gen, which we don't want 
keep_ft = ['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM']
g_hdb3 = g_hdb2[[ft in keep_ft for ft in g_hdb2['flat_type']]].copy()
len(g_hdb3)
regs = ['CR', 'ER', 'NR', 'NER', 'WR']

a_res_all2 = []
ph_res_all2 = []
for reg in regs:
    tmp = g_hdb3[g_hdb3['REGION_C']==reg]
    #print(tmp.head())
    a_res = pg.anova(data=tmp, dv='resale_price', between='flat_type',)
    a_res['REGION_C'] = reg
    a_res_all2.append(a_res)
    #print(a_res)  # check the ddof1 and ddof2
    ph_res = pg.pairwise_tukey(data=tmp, dv='resale_price', between='flat_type').round(3)
    #ph_res['p-tukey'] = ph_res['p-tukey'].round(4)
    ph_res['mean(A)'] = ph_res['mean(A)'].round(1)
    ph_res['mean(B)'] = ph_res['mean(B)'].round(1)
    ph_res['diff'] = ph_res['diff'].round(1)
    ph_res['flat_type'] = reg
    #print(ph_res)
    ph_res_all2.append(ph_res)
    #break

a_res_all2 = pd.concat(a_res_all2)  # combine the dataframes in the list into one dataframe
a_res_all2
for ph in ph_res_all2:
    ph2 = ph[ph['p-tukey']<=.05]
    print(ph)

all regions indicated that 2 room < 3 room < 4 room < 5 room

test run 2-ways anova

pg.anova(data=g_hdb3, dv='resale_price', between=['REGION_C', 'flat_type'], detailed=True)

the two main effects (regions and flat type) are significant, as well as the interaction effect

try aggregated analysis (aggregate by subzone) and mixed anova

mixed means contains both between and within data

  • between: mutually exclusive groups, subjects in one ‘group’ cannot appear in another ‘group’

  • within: every subject must have one value in the all ‘groups’

individual transaction data is mutually exclusive, because all transaction happens in only one location, has one flat type, has one and only one value in all columns.... so there is no ‘within’ group properties in it

to demonstrate mixed anova, we need to aggregate the transaction into some spatial units, then the spatial units can contains multiple transactions that falls in different groups, e.g., different flat types appear in the same subzone. Then, we can calculate the median/mean price for each type in the same subzone.

g_hdb3.head()
g_hdb3.groupby('SUBZONE_N')[['resale_price']].mean()
tmp = (g_hdb3['resale_age']/10).astype(int)*10 
## 0: 0 -- 9
## 10: 10 -- 19 ...
#tmp
g_hdb3['resale_age_group'] = tmp
sns.boxplot(x='resale_age_group', y='resale_price', data=g_hdb3)
g_hdb3.groupby('resale_age_group')[['resale_price']].count()
regroup = {
    0: 0, 
    10: 1, 
    20: 1, 
    30: 2, 
    40: 2,
    50: 2,
}

g_hdb4 = g_hdb3.groupby(['REGION_C', 'SUBZONE_N', 'flat_type'])[['resale_price']].median().reset_index()

g_hdb4  ## long table
g_hdb4b = g_hdb4.pivot(index=['REGION_C', 'SUBZONE_N'], columns='flat_type', values='resale_price').reset_index()
## all 'within group' data can be arranged into wide table, like this, 
## where each row represent a subject, and multiple columns containing the same variable (resale price) for different groups (flat type) 
g_hdb4b  # wide table
for col in ftypes:
    #print(col)
    nan_count_col = g_hdb4b[col].isna().sum()
    print(col, nan_count_col)
# perhaps we should remove 2 room, 87 missing value is more than 50% of the subzones
87/151  # 57.6%
g_hdb4c = g_hdb4b[['REGION_C', 'SUBZONE_N', '3 ROOM', '4 ROOM', '5 ROOM']]
g_hdb4c = g_hdb4c.dropna()
len(g_hdb4c)
g_hdb4c  # the wide table

let’s make some plots

ftypes2 = ['3 ROOM', '4 ROOM', '5 ROOM']

fig, axg = plt.subplots(2, 3, figsize=(12, 8), sharex=False, sharey=True)
axs = axg.flatten()

for i, reg in enumerate(g_hdb4c['REGION_C'].unique()):
    ax = axs[i]
    tmp = g_hdb4c[g_hdb4c['REGION_C']==reg]
    #print(tmp)
    print(i, reg, len(tmp))
    for j, row in tmp.iterrows():
        #print(row)
        vs = [row[ft] for ft in ftypes2]
        #print(vs)
        ax.plot(list(range(3)), vs, c='grey', alpha=.3)
        #break
    tmp2 = tmp[ftypes2].median()
    print(tmp2)
    ax.plot(list(range(3)), tmp2, c='blue', marker='o')
    ax.set_title(reg)
    #break
for ax in axs:
    ax.set_xticks([0, 1, 2])
    ax.set_xticklabels(ftypes2)

axs[-1].axis('off')

plt.tight_layout()
g_hdb4c_long = g_hdb4c.melt(id_vars=['REGION_C', 'SUBZONE_N'])  # convert it back to long table
g_hdb4c_long = g_hdb4c_long.rename(columns={'value': 'resale_price'})
g_hdb4c_long
pg.mixed_anova(data=g_hdb4c_long, dv='resale_price', within='flat_type', subject='SUBZONE_N', between='REGION_C',)
mph = pg.pairwise_tests(data=g_hdb4c_long, dv='resale_price', between='REGION_C', within='flat_type', subject='SUBZONE_N',
                          parametric=True, marginal=True, alpha=0.05, alternative='two-sided', 
                          padjust='none', effsize='hedges', correction='auto', nan_policy='listwise', 
                          return_desc=False, interaction=True, within_first=True)
mph
mph[mph['p-unc']>.05]
mph[mph['p-unc']<=.05]

3 parts:

  • the within group result: Contrast==flat_type

    • 3 room < 4 room < 5 room, which are expected

  • the between group result: Contrast==REGION_C

    • CR > all other regions

    • ER > NR

    • NER > NR and WR

    • NR > WR

  • the interaction: Contrast==flat_type * REGION_C

    • this examine if there is differences on “regional-relationship or regional-differences” between flat types

    • 4 rooms and 5 rooms are same, but 3 rooms have less significant result

mph2 = pg.pairwise_tests(data=g_hdb4c_long, dv='resale_price', between='REGION_C', within='flat_type', subject='SUBZONE_N',
                          parametric=True, marginal=True, alpha=0.05, alternative='two-sided', 
                          padjust='none', effsize='hedges', correction='auto', nan_policy='listwise', 
                          return_desc=False, interaction=True, within_first=False)
mph2
mph2[mph2['p-unc']>.05]

the interaction effect: are there any differences on the “flat types relationships” between regions---they were all significant and same accross regions