Lab 2: Grouping values - Geospatial Statistics and Visualisation

Use mapclassify to create breaks and groups of values (for choropleth maps)

Import libraries and functions

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mapclassify as mc

Basic path joining and loading tabular data

# Locate the dataset directory and load the resale table into `df`.
data_dir = '../datasets/data_dirs/'  # please change to the file location in your directories
os.listdir(data_dir)  # list the directory contents to check that the data file is present

fname = 'hdb_data_2023.csv.xz'  # filename of the file to be read
fp = os.path.join(data_dir, fname)  # join the filename with the location of file
print(fp)  # print to check

# index_col=0: use the first column of the CSV as the row index.
df = pd.read_csv(fp, index_col=0)
df.head()  # show the top n rows, n=5 by default

# Extract the resale prices as a plain Python list; `vs` is reused by the cells below.
vs = df['resale_price'].to_list()

# Histogram of the resale prices, to look at the distribution before grouping it.
sns.histplot(x='resale_price', data=df, )

# Equal-interval classification with mapclassify's default number of classes.
breaks = mc.EqualInterval(vs)
breaks  # display the classifier summary

# Class label of each observation (one entry per value in `vs`) — see mapclassify docs.
breaks.yb

# Break values of the classes; presumably the upper bound of each class — confirm in mapclassify docs.
breaks.bins

# Cross-check the classifier against np.histogram.
#
# `breaks.bins` on its own has no lower edge for the first class, so using it
# directly as histogram edges leaves out every value below the first break.
np.histogram(vs, bins=list(breaks.bins))

# Prepend the data minimum as the lower edge so that every observation is
# counted. Using min(vs) instead of a fixed number keeps this correct for any
# dataset (a fixed edge silently drops values that fall below it).
hist_counts, hist_edges = np.histogram(vs, bins=[min(vs), *breaks.bins])
hist_counts, hist_edges

len(vs)

# Sanity check: the histogram counts and the classifier's class counts should
# both add up to len(vs). They are computed here rather than pasted in by hand,
# so the check stays valid when the data changes.
# NOTE(review): the per-class counts themselves can differ slightly, because
# np.histogram bins are closed on the left while mapclassify classes appear to
# be closed on the right — values exactly on a break land in different classes.
int(hist_counts.sum()), int(breaks.counts.sum())

def plot_breaks(values, classifier, figsize=(6, 4)):
    """Plot a histogram of *values* with the classifier's breaks marked.

    A dashed vertical line is drawn at the minimum of *values* and at every
    entry of ``classifier.bins``.

    Parameters
    ----------
    values : sequence of numbers
        Observations to plot; must be non-empty because ``min`` is taken.
    classifier : object with a ``bins`` attribute
        A fitted mapclassify classifier; only ``bins`` is read.
    figsize : tuple of float, optional
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    tuple
        The ``(fig, ax)`` pair that was drawn on.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Draw on the axes explicitly instead of relying on the "current axes".
    sns.histplot(values, ax=ax)

    # The lower bound of the first class is not part of `bins`, so add it.
    for cut in [min(values), *classifier.bins]:
        ax.axvline(x=cut, c='k', ls='--')

    fig.tight_layout()
    return fig, ax


# Equal-width classes.
breaks = mc.EqualInterval(vs)
fig, ax = plot_breaks(vs, breaks)

# Quantile classes.
breaks = mc.Quantiles(vs)
fig, ax = plot_breaks(vs, breaks)

# Natural breaks.
breaks = mc.NaturalBreaks(vs)
fig, ax = plot_breaks(vs, breaks)

# Classes based on the mean and standard deviation.
breaks = mc.StdMean(vs)
fig, ax = plot_breaks(vs, breaks)

# Head/tail breaks. Keep this one last: `breaks` is reused by the cells below.
breaks = mc.HeadTailBreaks(vs)
fig, ax = plot_breaks(vs, breaks)

# Attach the class label of every row to the table. `breaks` is expected to be
# the HeadTailBreaks classifier fitted on the resale prices just before this.
df['htbreaks_yb'] = breaks.yb

df.head()

# Print each class label with the number of rows assigned to it,
# in the order the labels first appear in the table.
labels = df['htbreaks_yb']
for label in labels.unique():
    members = df[labels == label]
    print(label, len(members))

# Box plot of resale price per class, to compare the groups.
sns.boxplot(data=df, x='htbreaks_yb', y='resale_price')

Geospatial Statistics and Visualisation

Lab 1: Basic data processing

Geospatial Statistics and Visualisation

Lab 3: T-test and ANOVA