Skip to article frontmatterSkip to article content

Basic processing for data: Reading table and generating histogram plots

import libraries and functions

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import kurtosis

Basic path joining and loading tabular data

# Build the path to the dataset and load it into a DataFrame.
data_dir = '../datasets/data_dirs/'  # change this to the folder that holds the dataset on your machine
os.listdir(data_dir)  # list what is in that folder (raises if the folder does not exist)
fname = 'hdb_data_2023.csv.xz'  # filename of the file to be read
fp = os.path.join(data_dir, fname)  # join the folder and the filename into one path
print(fp)  # print to check

df = pd.read_csv(fp, index_col=0)  # the first column of the file becomes the index
df.head()  # show the top n rows, n=5 by default

Some notes on making plots

fig, axg = plt.subplots(2, 3)  # plt.subplots(n_rows, n_cols): one figure holding a 2-row, 3-column grid of subplots
fig, axg = plt.subplots(2, 3)  # a second 2 x 3 figure; fig and axg now refer to this one
plt.tight_layout()  # automatically arrange the subplots so they fill the figure without overlapping each other
axg  # the grid of axes, organised by row
axs = axg.flatten()  # the same axes as a single one-dimensional array
axs

Axes are the subplots, i.e., the handles we use to point to and identify the separate subplot spaces.

In the example above, axg is an array containing two sub-arrays, and each sub-array contains 3 axes. The sub-arrays correspond to the rows of the grid (from top to bottom).

The .flatten() method converts the nested sub-arrays into a single one-dimensional array, ordered left to right along the first row, then left to right along the next row, and so on, like tracing the letter Z.

See the two examples below: one for axg and one for the flattened axs.

# Walk the 2-D grid row by row and title every panel with its position.
fig, axg = plt.subplots(2, 3)  # 2 rows, 3 columns of subplots

labels = 'abc'  # one letter per column

for row_idx, row_axes in enumerate(axg):
    # pair each axes in this row with the letter of its column
    for ax, col_label in zip(row_axes, labels):
        ax.set_title('row {}, col {}'.format(row_idx, col_label))

plt.tight_layout()  # arrange the subplots so they fill the figure without overlapping
# Same grid, but flattened first so a single loop reaches every panel.
fig, axg = plt.subplots(2, 3)  # 2 rows, 3 columns of subplots
axs = axg.flatten()  # one-dimensional array of the six axes

labels = 'abcdef'  # one letter per panel

for ax, label in zip(axs, labels):
    ax.set_title(label)

plt.tight_layout()  # arrange the subplots so they fill the figure without overlapping

generating distribution plots

# First look: resale price per storey range.  The plot gets a figure of its
# own; without an explicit `ax`, seaborn draws on whatever axes happens to be
# current, which here would be a leftover panel of an earlier subplot figure.
fig, ax = plt.subplots()
sns.boxplot(x='storey_range', y='resale_price', data=df, ax=ax)
df['storey_range'].unique()  # the distinct storey-range categories

# Resale age per storey range, on a wide figure with rotated tick labels.
fig, ax = plt.subplots(figsize=(10, 4))  # figsize is (width, height)
sns.boxplot(x='storey_range', y='resale_age',
            color='pink', order=sorted(df['storey_range'].unique()),
            data=df, ax=ax)

xs = ax.get_xticklabels()
print(xs)  # the tick labels as matplotlib Text objects
xs = [x.get_text() for x in xs]  # keep only the label strings
# NOTE(review): passing label strings to set_xticks relies on the x axis being
# categorical -- confirm this holds for the installed seaborn version.
ax.set_xticks(xs)
ax.set_xticklabels(xs, rotation=55)  # rotate so the long labels do not collide
ax.set_title('Resale age against storey range')
ax.set_xlabel('Storey range')
ax.set_ylabel('Resale age')

plt.tight_layout()
# Resale price per storey range, coloured from seaborn's 'colorblind' palette.
clrs = sns.color_palette('colorblind')
clrs
fig, ax = plt.subplots(figsize=(10, 4))  # figsize is (width, height)
sns.boxplot(
    data=df,
    x='storey_range',
    y='resale_price',
    order=sorted(df['storey_range'].unique()),
    color=clrs[1],  # second colour of the palette
    ax=ax,
)
# Read the label strings back and re-apply them rotated.
xs = [tick.get_text() for tick in ax.get_xticklabels()]
ax.set_xticks(xs)
ax.set_xticklabels(xs, rotation=55)
ax.set(
    title='Resale price against storey range',
    xlabel='Storey range',
    ylabel='Resale price',
)
plt.tight_layout()
# Resale price per resale age, one box for each distinct age value.
fig, ax = plt.subplots(figsize=(10, 4))  # figsize is (width, height)
sns.boxplot(
    data=df,
    x='resale_age',
    y='resale_price',
    order=sorted(df['resale_age'].unique()),
    color='pink',
    ax=ax,
)
# Read the label strings back and re-apply them rotated.
xs = [tick.get_text() for tick in ax.get_xticklabels()]
ax.set_xticks(xs)
ax.set_xticklabels(xs, rotation=55)
ax.set(
    title='Resale price against age',
    xlabel='Resale age',
    ylabel='Resale price',
)
plt.tight_layout()
# Distribution of resale age, drawn four ways.  Each plot gets its own figure:
# without an explicit `ax`, seaborn would stack all four on whichever axes is
# current (the previous boxplot).
fig, ax = plt.subplots()
sns.histplot(x='resale_age', data=df, ax=ax)  # histogram with automatically chosen bins

fig, ax = plt.subplots()
sns.kdeplot(x='resale_age', data=df, ax=ax)  # kernel density estimate of the same column

fig, ax = plt.subplots()
sns.histplot(x=df['resale_age'], ax=ax)  # same histogram, passing the column itself instead of its name

fig, ax = plt.subplots()
sns.histplot(x='resale_age', data=df, bins=[x*5 for x in range(20)], ax=ax)  # explicit bin edges 0, 5, ..., 95
list(range(20))  # from zero up to the integer before the number given here (20), i.e. 0..19
list(range(5, 12))  # from the first number (inclusive, 5) up to the integer before the last number (12)
[x*5 for x in range(13)]  # list comprehension in Python: the multiples of 5 from 0 to 60
a_list = [] # an empty list; same as list()
# the same list built the long way, with an explicit loop
for x in range(13):
    a_list.append(x*5)
print(a_list)
# Bin counts without plotting: np.histogram returns (counts, bin_edges).
np.histogram(df['resale_age'])  # default binning
np.histogram(df['resale_age'], bins=list(range(0, 100, 5)))  # explicit edges 0, 5, ..., 95

data transformation

# Min-max scaling of the resale prices.  Every histogram is drawn on its own
# figure: the three value ranges (raw prices, 0..1, 1..100) are not comparable,
# and without an explicit `ax` seaborn would overlay them on the current axes.
fig, ax = plt.subplots()
sns.histplot(x='resale_price', data=df, ax=ax)  # raw prices, for reference
vs = df['resale_price'].to_list()
vmin = min(vs)
vmax = max(vs)

# Rescale to the range [0, 1].
min_max_vs = [(v - vmin) / (vmax - vmin) for v in vs]
print(min(min_max_vs), max(min_max_vs))
fig, ax = plt.subplots()
sns.histplot(min_max_vs, ax=ax)

# Rescale to the range [1, 100]: stretch the unit range by (100 - 1), then shift by 1.
min_max_vs2 = [(v - vmin) / (vmax - vmin) * (100 - 1) + 1 for v in vs]
print(min(min_max_vs2), max(min_max_vs2))
fig, ax = plt.subplots()
sns.histplot(min_max_vs2, ax=ax)
# Standardise the prices: subtract the mean, then divide by the standard deviation.
mu = np.mean(vs)
sig = np.std(vs)
std_vs = [(v - mu) / sig for v in vs]

print(min(std_vs), max(std_vs))
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(std_vs, ax=ax)
# Mark the mean (dotted) and one standard deviation on either side (dashed).
centre = np.mean(std_vs)
spread = np.std(std_vs)
ax.axvline(x=centre, ls=':', c='k')
for edge in (centre + spread, centre - spread):
    ax.axvline(x=edge, ls='--', c='grey')
plt.tight_layout()
# Log-transform the prices, then standardise the logged values.
vs_log = np.log(vs)
# Own figure for the logged prices: without an explicit `ax`, seaborn would
# draw this histogram on top of the previous, already finished figure.
fig, ax = plt.subplots()
sns.histplot(vs_log, ax=ax)
mu = np.mean(vs_log)
sig = np.std(vs_log)
std_log_vs = [(v - mu) / sig for v in vs_log]

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(std_log_vs, ax=ax, fc='xkcd:sky blue')
# Mark the mean (thicker line) and one standard deviation on either side.
ax.axvline(x=np.mean(std_log_vs), ls='--', c='k', lw=2)
ax.axvline(x=np.mean(std_log_vs) + np.std(std_log_vs), ls='--', c='k')
ax.axvline(x=np.mean(std_log_vs) - np.std(std_log_vs), ls='--', c='k')
plt.tight_layout()

run kurtosis to check the shape


# Kurtosis under both definitions (Fisher and Pearson) for three versions of
# the price data.  Each pair overwrites the previous one.

# raw prices
kurtosis_fisher, kurtosis_pearson = kurtosis(vs, fisher=True), kurtosis(vs, fisher=False)
kurtosis_fisher, kurtosis_pearson

# log-transformed prices
kurtosis_fisher, kurtosis_pearson = kurtosis(vs_log, fisher=True), kurtosis(vs_log, fisher=False)
kurtosis_fisher, kurtosis_pearson

# standardised log prices
kurtosis_fisher, kurtosis_pearson = kurtosis(std_log_vs, fisher=True), kurtosis(std_log_vs, fisher=False)
kurtosis_fisher, kurtosis_pearson

fisher: bool, optional
If True, Fisher’s definition is used (normal ==> 0.0).
If False, Pearson’s definition is used (normal ==> 3.0).