import os
import random # for generating random point
import numpy as np # for numerical values manipulation (calculation etc.)
import pandas as pd # for table/spreadsheets manipulation
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd # for geospatial data / attribute table & geometry manipulation
from shapely import wkt # for geometry data manipulation
import matplotlib.pyplot as plt # the most basic and important for making plots/figures in Python
import seaborn as sns # convenient tools for making statistical plots
import pingouin as pg # for running hypothesis testing (T-test, ANOVA)
## https://pingouin-stats.org/build/html/index.html # go to heir documentation website to check the syntax and roadmap
#from shapely.geometry import Point # for manipulating point data
#from scipy import stats # for calculating statistical index
#from scipy import special #
#from tqdm import tqdm # for the progress bar
#import mapclassify as mc # for generating categorisation for numerical data
os.listdir('../resources/')

## load subzone file: the subzone shapefile (zipped)
subz = gpd.read_file('../resources/master-plan-2019-subzone_landuse_area.zip')
subz.head()  # get the first 5 rows
subz.tail(2) # get the last 2 rows
subz.plot()  # show the spatial data
subz.crs     # this is the EPSG:3414 crs
subz.columns # show columns, seems like it has too many columns/info
## remove some non-needed columns, and keep only some important one
## keep only the subzone/planning-area/region identifiers and the geometry
keep_cols = [
    'SUBZONE_N', 'SUBZONE_C',
    'PLN_AREA_N', 'PLN_AREA_C',
    'REGION_N', 'REGION_C',
    'geometry']
subz = subz[keep_cols].copy() # overriding the object by assigning the same name, this can be risky but here is ok
subz.head()

## load HDB data
## load the HDB resale transactions and spatially join them to the subzones
hdb = pd.read_csv('../resources/hdb_data_2023.csv.xz', index_col=0) # first (0) column contain 'index' of the rows
hdb.head()    # show the first 5 rows (by default, is 5 rows)
hdb.tail(10)  # show the last 10 rows
hdb.columns   # to check what columns is included in the csv/dataframe
hdb['geometry'].apply(wkt.loads) ## parse WKT strings into geometries; this is what this .apply() do
g_hdb = gpd.GeoDataFrame(data=hdb,
                         geometry=hdb['geometry'].apply(wkt.loads),
                         crs=subz.crs) # use the same crs as the subzone, make sure they are same before setting it
g_hdb.plot()

## X.sjoin(Y) is the spatial join function, joining the Y to X
## in this case, is to join the polygon info to the point
g_hdb.sjoin(subz) # testing how it looks, without keeping the test result as a new object
g_hdb2 = g_hdb.sjoin(subz) # save the spatial join result as a new geopandas object
# assign it to a new name, which is safer (not overriding)
g_hdb2.head()

## try individual transaction analysis
## check the resale prices between regions
sns.histplot(g_hdb2['resale_price'])
sns.boxplot(x='REGION_C', y='resale_price', data=g_hdb2)
sns.violinplot(x='REGION_C', y='resale_price', data=g_hdb2)
sns.violinplot(x='floor_area_sqm', y='resale_price', data=g_hdb2)
sns.boxplot(x='flat_type', y='resale_price', data=g_hdb2)
sns.boxplot(x='flat_type', y='resale_price', hue='REGION_C', data=g_hdb2)
## let's rearrange things so that it is more intuitive:
## fix the ordering of the flat types and of the region hues
sns.boxplot(x='flat_type', y='resale_price', hue='REGION_C', data=g_hdb2,
            order=['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM'],
            hue_order=['NR', 'WR', 'ER', 'NER', 'CR'],
            )
sns.color_palette('muted')
sns.boxplot(x='flat_type', y='resale_price', hue='REGION_C', data=g_hdb2,
            order=['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM'],
            hue_order=['NR', 'WR', 'ER', 'NER', 'CR'],
            palette=sns.color_palette('bright')[:5]
            )
## an interesting point here:
CR is lower than other regions for 2 room, about the same for 3 room, and higher than other zones for 4 and 5 rooms
for 2/3 rooms, the outliers are visibly higher
is it because the number of transactions is smaller in CR? let's check
g_hdb2.groupby('REGION_C')[['resale_price']].count()g_hdb2.groupby(['flat_type', 'REGION_C'])[['resale_price']].count()in terms of # transaction, CR doesn’t seem to be different from others note that the word ‘seem’<--means visually speaking
remark 1¶
so, different flat type may have different price pattern between regions
traditionally we may want to use a two-way ANOVA; here let's use what we covered in the class: one-way ANOVA
run one-way ANOVA separately for different flat type, between REGION
## inspect the '1 ROOM' flat type (it is excluded from ftypes below)
g_hdb2['flat_type']=='1 ROOM', len(g_hdb2)
g_hdb2[g_hdb2['flat_type']=='1 ROOM']

## run one-way ANOVA separately for each flat type, between regions
ftypes = ['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM']
## declare the empty list for storing results
a_res_all = []  # for storing anova result
ph_res_all = [] # for storing post-hoc result
for ft in ftypes:
    tmp = g_hdb2[g_hdb2['flat_type']==ft]
    #print(ft, len(tmp))
    # run anova: resale price between regions, within this flat type
    a_res = pg.anova(data=tmp, dv='resale_price', between='REGION_C',)
    a_res['flat_type'] = ft  # tag the result so the combined table is traceable
    #print(a_res)
    a_res_all.append(a_res) # store anova result
    # run post-hoc (Tukey pairwise comparisons between regions)
    ph_res = pg.pairwise_tukey(data=tmp, dv='resale_price', between='REGION_C').round(3)
    #ph_res['p-tukey'] = ph_res['p-tukey'].round(4)
    ph_res['mean(A)'] = ph_res['mean(A)'].round(1)
    ph_res['mean(B)'] = ph_res['mean(B)'].round(1)
    ph_res['diff'] = ph_res['diff'].round(1)
    ph_res['flat_type'] = ft
    #print(ph_res)
    ph_res_all.append(ph_res) # store post-hoc result
a_res_all = pd.concat(a_res_all) # combine the dataframes in the list into one dataframe
#ph_res_all = pd.concat(ph_res_all)
a_res_all#, ph_res_all

## show only the significant post-hoc comparisons for each flat type
for ph_res in ph_res_all:
    tmp = ph_res[ph_res['p-tukey']<=.05] # filter to get significant row only
    print(tmp)

## summary
for 2 room, CR is significantly lower than all others; NER > WR
for 3 room, CR is significantly higher than all others (this is not visible); NER > (ER, NR, WR)
for 4 and 5 rooms:
CR > all
ER > most of them
NER > NR & WR
NR < WR
ER and NER: sometimes one is significantly higher than the other, and this does not always hold for other flat types
this means that, for different flat types, the relationship (> or <) may change between regions
try to see how the different flat type in each region¶
## trial run: one-way ANOVA of resale price between flat types, within one region
regs = ['CR', 'ER', 'NR', 'NER', 'WR']
a_res_all2 = []
for reg in regs:
    subset = g_hdb2[g_hdb2['REGION_C'] == reg]
    result = pg.anova(data=subset, dv='resale_price', between='flat_type',)
    result['REGION_C'] = reg  # tag the region so combined tables stay traceable
    a_res_all2.append(result)
    print(result)
    break  # stop after the first region: this is only a trial run
# note the ddof1: the flat type contains 1 room, executive, and multi-gen, which we don't want
## keep only the four common flat types for the rest of the analysis
keep_ft = ['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM']
# .isin() replaces the manual list-comprehension membership test (same result, idiomatic pandas)
g_hdb3 = g_hdb2[g_hdb2['flat_type'].isin(keep_ft)].copy()
len(g_hdb3)

regs = ['CR', 'ER', 'NR', 'NER', 'WR']
## one-way ANOVA of resale price between flat types, separately for each region
a_res_all2 = []   # for storing anova results
ph_res_all2 = []  # for storing post-hoc results
for reg in regs:
    tmp = g_hdb3[g_hdb3['REGION_C']==reg]
    #print(tmp.head())
    a_res = pg.anova(data=tmp, dv='resale_price', between='flat_type',)
    a_res['REGION_C'] = reg
    a_res_all2.append(a_res)
    #print(a_res) # check the ddof1 and ddof2
    ph_res = pg.pairwise_tukey(data=tmp, dv='resale_price', between='flat_type').round(3)
    #ph_res['p-tukey'] = ph_res['p-tukey'].round(4)
    ph_res['mean(A)'] = ph_res['mean(A)'].round(1)
    ph_res['mean(B)'] = ph_res['mean(B)'].round(1)
    ph_res['diff'] = ph_res['diff'].round(1)
    # BUG FIX: was ph_res['flat_type'] = reg -- the loop iterates over regions,
    # so the tag column must be REGION_C (mirrors a_res above)
    ph_res['REGION_C'] = reg
    #print(ph_res)
    ph_res_all2.append(ph_res)
    #break
a_res_all2 = pd.concat(a_res_all2) # combine the dataframes in the list into one dataframe
a_res_all2

## show only the significant post-hoc comparisons for each region
for ph in ph_res_all2:
    ph2 = ph[ph['p-tukey']<=.05]
    # BUG FIX: was print(ph) -- that printed the full table, ignoring the filter
    print(ph2)
## all regions indicated that 2 room < 3 room < 4 room < 5 room
test run: two-way ANOVA
pg.anova(data=g_hdb3, dv='resale_price', between=['REGION_C', 'flat_type'], detailed=True)the two main effects (regions and flat type) are significant, as well as the interaction effect
try aggregated analysis (aggregate by subzone) and mixed anova¶
mixed means contains both between and within data
between: mutually exclusive groups, subjects in one ‘group’ cannot appear in another ‘group’
within: every subject must have one value in the all ‘groups’
individual transaction data is mutually exclusive, because all transaction happens in only one location, has one flat type, has one and only one value in all columns.... so there is no ‘within’ group properties in it
to demonstrate mixed anova, we need to aggregate the transaction into some spatial units, then the spatial units can contains multiple transactions that falls in different groups, e.g., different flat types appear in the same subzone. Then, we can calculate the median/mean price for each type in the same subzone.
g_hdb3.head()
g_hdb3.groupby('SUBZONE_N')[['resale_price']].mean()

## bin the resale age into 10-year groups (integer floor division to the decade)
tmp = (g_hdb3['resale_age']/10).astype(int)*10
## 0: 0 -- 9
## 10: 10 -- 19 ...
#tmp
g_hdb3['resale_age_group'] = tmp
sns.boxplot(x='resale_age_group', y='resale_price', data=g_hdb3)
g_hdb3.groupby('resale_age_group')[['resale_price']].count()
## coarser regrouping of the decade bins (0-9 -> 0, 10-29 -> 1, 30-59 -> 2)
## NOTE(review): `regroup` is not applied anywhere in this visible chunk -- confirm it is used later
regroup = {
    0: 0,
    10: 1,
    20: 1,
    30: 2,
    40: 2,
    50: 2,
}
## aggregate: median resale price per (region, subzone, flat type)
g_hdb4 = g_hdb3.groupby(['REGION_C', 'SUBZONE_N', 'flat_type'])[['resale_price']].median().reset_index()
g_hdb4 ## long table
g_hdb4b = g_hdb4.pivot(index=['REGION_C', 'SUBZONE_N'], columns='flat_type', values='resale_price').reset_index()
## all 'within group' data can be arranged into wide table, like this,
## where each row represent a subject, and multiple columns containing the same variable (resale price) for different groups (flat type)
g_hdb4b # wide table

## count the missing values per flat-type column (subzones without that flat type)
for col in ftypes:
    #print(col)
    nan_count_col = g_hdb4b[col].isna().sum()
    print(col, nan_count_col)
# perhaps we should remove 2 room, 87 missing value is more than 50% of the subzones
87/151 # 57.6%
g_hdb4c = g_hdb4b[['REGION_C', 'SUBZONE_N', '3 ROOM', '4 ROOM', '5 ROOM']]
g_hdb4c = g_hdb4c.dropna()
len(g_hdb4c)
g_hdb4c # the wide table

## let's make some plots
## spaghetti plot: per region, one grey line per subzone (its median price for
## 3/4/5 ROOM), plus the per-region median trajectory highlighted in blue
ftypes2 = ['3 ROOM', '4 ROOM', '5 ROOM']
fig, axg = plt.subplots(2, 3, figsize=(12, 8), sharex=False, sharey=True)
axs = axg.flatten()
x_pos = list(range(3))  # one x position per flat type
for i, reg in enumerate(g_hdb4c['REGION_C'].unique()):
    panel = axs[i]
    region_df = g_hdb4c[g_hdb4c['REGION_C'] == reg]
    print(i, reg, len(region_df))
    # one faint grey line per subzone
    for _, subzone_row in region_df.iterrows():
        prices = [subzone_row[ft] for ft in ftypes2]
        panel.plot(x_pos, prices, c='grey', alpha=.3)
    # the regional median of the subzone medians
    region_median = region_df[ftypes2].median()
    print(region_median)
    panel.plot(x_pos, region_median, c='blue', marker='o')
    panel.set_title(reg)
# label every panel's x axis with the flat types
for panel in axs:
    panel.set_xticks([0, 1, 2])
    panel.set_xticklabels(ftypes2)
axs[-1].axis('off')  # only 5 regions -> hide the unused 6th panel
plt.tight_layout()
## convert the wide table back to long form for the mixed ANOVA
g_hdb4c_long = g_hdb4c.melt(id_vars=['REGION_C', 'SUBZONE_N']) # convert it back to long table
g_hdb4c_long = g_hdb4c_long.rename(columns={'value': 'resale_price'})
g_hdb4c_long

## mixed ANOVA: between = region, within = flat type, subject = subzone
pg.mixed_anova(data=g_hdb4c_long, dv='resale_price', within='flat_type', subject='SUBZONE_N', between='REGION_C',)

## pairwise post-hoc tests (within factor tested first)
mph = pg.pairwise_tests(data=g_hdb4c_long, dv='resale_price', between='REGION_C', within='flat_type', subject='SUBZONE_N',
                        parametric=True, marginal=True, alpha=0.05, alternative='two-sided',
                        padjust='none', effsize='hedges', correction='auto', nan_policy='listwise',
                        return_desc=False, interaction=True, within_first=True)
mph
mph[mph['p-unc']>.05]
mph[mph['p-unc']<=.05]

## 3 parts:
the within group result: Contrast==flat_type
3 room < 4 room < 5 room, which are expected
the between group result: Contrast==REGION_C
CR > all other regions
ER > NR
NER > NR and WR
NR > WR
the interaction: Contrast==flat_type * REGION_C
this examine if there is differences on “regional-relationship or regional-differences” between flat types
4 rooms and 5 rooms are same, but 3 rooms have less significant result
## same pairwise tests but with within_first=False: the between factor (region)
## is tested first, which changes how the interaction rows are organised
mph2 = pg.pairwise_tests(data=g_hdb4c_long, dv='resale_price', between='REGION_C', within='flat_type', subject='SUBZONE_N',
                         parametric=True, marginal=True, alpha=0.05, alternative='two-sided',
                         padjust='none', effsize='hedges', correction='auto', nan_policy='listwise',
                         return_desc=False, interaction=True, within_first=False)
mph2
mph2[mph2['p-unc']>.05]
## the interaction effect: are there any differences in the "flat type relationships"
## between regions --- they were all significant and the same across regions