Lab 1: Basic data processing - Geospatial Statistics and Visualisation

Basic processing for data: Reading table and generating histogram plots¶

import libraries and functions¶

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import kurtosis

basic path joining and loading tabulate data¶

# Folder that holds the lab datasets.
data_dir = '../datasets/data_dirs/'  # please change to the file location in your directories
os.listdir(data_dir)  # list the folder contents to confirm the data file is there

fname = 'hdb_data_2023.csv.xz'  # filename of the file to be read
fp = os.path.join(data_dir, fname)  # join the filename with the location of the file
print(fp)  # print to check

# Load the table, using the first column of the file as the row index.
df = pd.read_csv(fp, index_col=0)
df.head()  # show the top n rows, n=5 by default

some notes on making plots¶

fig, axg = plt.subplots(2, 3)  # create a grid of subplots with 2 rows and 3 columns: plt.subplots(n_rows, n_columns)

fig, axg = plt.subplots(2, 3)  # the same 2-row, 3-column grid, this time followed by tight_layout
plt.tight_layout()  # automatically arrange the subplots 'tightly': filling the figure space without overlapping each other

axg  # display the array of axes returned by plt.subplots

axs = axg.flatten()  # collapse the array of axes into a single one-dimensional array
axs

axes are the subplots, i.e., the handles we use to point to and identify each separate subplot space;

In the above example, axg is an array with two sub-arrays, and each sub-array contains 3 axes. These sub-arrays correspond to the rows (from top to bottom).

The .flatten() method converts the multiple sub-arrays into a single array, ordered left to right along the first row, then left to right along the next row, and so on, like tracing the letter Z.

See the two examples below: one for axg and one for the flattened axs.

# Title every subplot with its grid position by walking the 2-D array of
# axes one row at a time.
fig, axg = plt.subplots(2, 3)  # 2 rows x 3 columns of subplots

labels = 'abc'  # one letter per column

for row_idx, row_axes in enumerate(axg):
    # Pair each axes in this row with the letter of its column.
    for col_label, ax in zip(labels, row_axes):
        ax.set_title('row {}, col {}'.format(row_idx, col_label))

plt.tight_layout()  # pack the subplots into the figure without overlaps

# Title every subplot using the flattened, one-dimensional array of axes.
fig, axg = plt.subplots(2, 3)  # 2 rows x 3 columns of subplots
axs = axg.flatten()  # single array holding all six axes

labels = 'abcdef'  # one letter per subplot

# Walk the letters and the flattened axes together.
for label, ax in zip(labels, axs):
    ax.set_title(label)

plt.tight_layout()  # pack the subplots into the figure without overlaps

generating distribution plots¶

# Quick box plot of resale price for each storey range, with no explicit category order.
sns.boxplot(x='storey_range', y='resale_price', data=df)

# List the distinct storey-range values found in the column.
df['storey_range'].unique()

# Box plot of resale age for each storey range, with the ranges sorted
# along the x axis.
fig, ax = plt.subplots(figsize=(10, 4))  # figsize (width, height)
sns.boxplot(x='storey_range', y='resale_age',
            color='pink', order=sorted(df['storey_range'].unique()),
            data=df, ax=ax)

print(ax.get_xticklabels())  # inspect the tick labels seaborn created

# Rotate the existing tick labels in place. set_xticks() expects tick
# positions, so feeding it the label strings only works when the axis
# already carries categorical unit information, which is not guaranteed
# across seaborn versions; tick_params() avoids that dependency.
ax.tick_params(axis='x', labelrotation=55)
ax.set_title('Resale age against storey range')
ax.set_xlabel('Storey range')
ax.set_ylabel('Resale age')

plt.tight_layout()

# Colour-blind-friendly palette; displaying it shows the available colours.
clrs = sns.color_palette('colorblind')
clrs

# Box plot of resale price for each storey range, drawn in the second
# palette colour with the ranges sorted along the x axis.
fig, ax = plt.subplots(figsize=(10, 4))  # figsize (width, height)
sns.boxplot(x='storey_range', y='resale_price',
            color=clrs[1], order=sorted(df['storey_range'].unique()),
            data=df, ax=ax)
# Rotate the existing tick labels in place. set_xticks() expects tick
# positions, so feeding it the label strings only works when the axis
# already carries categorical unit information, which is not guaranteed
# across seaborn versions; tick_params() avoids that dependency.
ax.tick_params(axis='x', labelrotation=55)
ax.set_title('Resale price against storey range')
ax.set_xlabel('Storey range')
ax.set_ylabel('Resale price')
plt.tight_layout()

# Box plot of resale price for each resale age, with the ages sorted
# along the x axis.
fig, ax = plt.subplots(figsize=(10, 4))  # figsize (width, height)
sns.boxplot(x='resale_age', y='resale_price',
            color='pink', order=sorted(df['resale_age'].unique()),
            data=df, ax=ax)
# Rotate the existing tick labels in place. set_xticks() expects tick
# positions, so feeding it the label strings only works when the axis
# already carries categorical unit information, which is not guaranteed
# across seaborn versions; tick_params() avoids that dependency.
ax.tick_params(axis='x', labelrotation=55)
ax.set_title('Resale price against age')
ax.set_xlabel('Resale age')
ax.set_ylabel('Resale price')
plt.tight_layout()

# Histogram of resale age, naming the column and passing the DataFrame via data=.
sns.histplot(x='resale_age', data=df)

# Kernel density estimate of the same column.
sns.kdeplot(x='resale_age', data=df)

# Same histogram, this time passing the column itself instead of its name.
sns.histplot(x=df['resale_age'])

# Histogram with explicit bin edges every 5 units: 0, 5, ..., 95.
sns.histplot(x='resale_age', data=df, bins=[x*5 for x in range(20)])

list(range(20))  # integers from 0 up to, but not including, 20

list(range(5, 12))  # integers from 5 (inclusive) up to, but not including, 12

[5 * k for k in range(13)]  # list comprehension: the multiples of 5 from 0 to 60

# The same list built the long way, with an explicit loop and append.
a_list = list()
for k in range(13):
    a_list.append(5 * k)
print(a_list)

# Counts and bin edges of resale age using NumPy's default of ten equal-width bins.
np.histogram(df['resale_age'])

# Counts with explicit bin edges every 5 units: 0, 5, ..., 95.
np.histogram(df['resale_age'], bins=list(range(0, 100, 5)))

data transformation¶

# Distribution of the raw resale prices.
sns.histplot(data=df, x='resale_price')

# Min-max scaling: map the prices linearly onto the interval [0, 1].
vs = df['resale_price'].to_list()
vmin, vmax = min(vs), max(vs)

min_max_vs = [(price - vmin) / (vmax - vmin) for price in vs]
print(min(min_max_vs), max(min_max_vs))
sns.histplot(min_max_vs)


# The same scaling stretched onto the interval [1, 100].
min_max_vs2 = [(price - vmin) / (vmax - vmin) * (100 - 1) + 1 for price in vs]
print(min(min_max_vs2), max(min_max_vs2))
sns.histplot(min_max_vs2)

# Standardise the prices: subtract the mean, then divide by the standard deviation.
mu, sig = np.mean(vs), np.std(vs)
std_vs = [(price - mu) / sig for price in vs]

print(min(std_vs), max(std_vs))
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(std_vs, ax=ax)
# Dotted line at the mean of the standardised values, dashed grey lines one
# standard deviation above and below it.
centre = np.mean(std_vs)
spread = np.std(std_vs)
ax.axvline(x=centre, ls=':', c='k')
for edge in (centre + spread, centre - spread):
    ax.axvline(x=edge, ls='--', c='grey')
plt.tight_layout()

# Natural-log transform of the prices, then its histogram.
vs_log = np.log(vs)
sns.histplot(vs_log)

# Standardise the log prices: subtract the mean, then divide by the standard deviation.
mu, sig = np.mean(vs_log), np.std(vs_log)
std_log_vs = [(log_price - mu) / sig for log_price in vs_log]

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(std_log_vs, ax=ax, fc='xkcd:sky blue')
# Thick dashed line at the mean of the standardised values, thinner dashed
# lines one standard deviation above and below it.
centre = np.mean(std_log_vs)
spread = np.std(std_log_vs)
ax.axvline(x=centre, ls='--', c='k', lw=2)
for edge in (centre + spread, centre - spread):
    ax.axvline(x=edge, ls='--', c='k')
plt.tight_layout()

run kurtosis to check the shape¶


# Kurtosis of the raw resale prices under both definitions.
kurtosis_fisher = kurtosis(vs, fisher=True)  # Fisher's definition: a normal distribution gives 0.0
kurtosis_pearson = kurtosis(vs, fisher=False)  # Pearson's definition: a normal distribution gives 3.0

kurtosis_fisher, kurtosis_pearson


# Kurtosis of the log-transformed prices (the names above are reused).
kurtosis_fisher = kurtosis(vs_log, fisher=True)
kurtosis_pearson = kurtosis(vs_log, fisher=False)

kurtosis_fisher, kurtosis_pearson


# Kurtosis of the standardised log prices (the names above are reused again).
kurtosis_fisher = kurtosis(std_log_vs, fisher=True)
kurtosis_pearson = kurtosis(std_log_vs, fisher=False)

kurtosis_fisher, kurtosis_pearson

fisher: bool, optional
If True, Fisher’s definition is used (normal ==> 0.0).
If False, Pearson’s definition is used (normal ==> 3.0).

Geospatial Statistics and Visualisation

Lab 0: Preparation

Geospatial Statistics and Visualisation

Lab 2: Grouping values