# Basic processing for data: reading a table and generating histogram plots
# Import libraries and functions
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kurtosis
# Basic path joining and loading tabular data
# Locate the dataset on disk and load it into a DataFrame.
data_dir = '../datasets/data_dirs/'  # please change to the file location in your directories
os.listdir(data_dir)  # list what is inside the data directory

fname = 'hdb_data_2023.csv.xz'  # filename of the file to be read
fp = os.path.join(data_dir, fname)  # join the filename with the location of file
print(fp)  # print to check
# index_col=0: use the first column of the file as the row index
df = pd.read_csv(fp, index_col=0)
df.head()  # show the top n rows, n=5 by default

# --- Some notes on making plots ---
# plt.subplots(number of rows, number of columns)
fig, axg = plt.subplots(2, 3)  # to create 2 rows, 3 columns subplots

fig, axg = plt.subplots(2, 3)  # to create 2 rows, 3 columns subplots
# automatically arrange to make the subplots 'tight':
# filling the figure space and not overlapping each other
plt.tight_layout()

axg  # the grid of axes as returned by plt.subplots

axs = axg.flatten()
axs  # the same axes after flattening

# Axes are the subplots, i.e., how we can point to and identify the separate
# subplot spaces.
# In the example above, axg is an array with two sub-arrays; each sub-array
# contains 3 axes. These sub-arrays correspond to the rows (from top to bottom).
# The .flatten() method converts the multiple sub-arrays into a single array,
# which goes left to right along a row, then on to the next row, and so on --
# like the strokes of a Z.
# See the two examples below: one for axg and one for the flattened axs.
# Example 1: walk the 2-D grid row by row, then axis by axis within the row.
fig, axg = plt.subplots(2, 3)  # to create 2 rows, 3 columns subplots
labels = 'abc'
for i, row_axes in enumerate(axg):
    # pair every axis in this row with its column letter
    for ax, label in zip(row_axes, labels):
        ax.set_title(f'row {i}, col {label}')
# automatically arrange to make the subplots 'tight':
# filling the figure space and not overlapping each other
plt.tight_layout()

# Example 2: flatten the grid first, then walk a single 1-D sequence of axes.
fig, axg = plt.subplots(2, 3)  # to create 2 rows, 3 columns subplots
axs = axg.flatten()  # check this
labels = 'abcdef'
for ax, label in zip(axs, labels):
    ax.set_title(label)
plt.tight_layout()

# --- Generating distribution plots ---
# Quick look at resale price per storey range (no explicit category order given).
sns.boxplot(x='storey_range', y='resale_price', data=df)

df['storey_range'].unique()  # the distinct storey-range values in the table

# Resale age per storey range, with the categories sorted along the x axis.
fig, ax = plt.subplots(figsize=(10, 4))  # figsize (width, height)
sns.boxplot(x='storey_range', y='resale_age',
            color='pink', order=sorted(df['storey_range'].unique()),
            data=df, ax=ax)
xs = ax.get_xticklabels()
print(xs)  # inspect the tick-label objects of the x axis
# Rotate the existing tick labels in place. This avoids feeding label strings
# back into set_xticks(), which relies on the axis having categorical units.
ax.tick_params(axis='x', labelrotation=55)
ax.set_title('Resale age against storey range')
ax.set_xlabel('Storey range')
ax.set_ylabel('Resale age')
plt.tight_layout()

clrs = sns.color_palette('colorblind')
clrs  # display the colour palette

# Resale price per storey range, drawn with the second colour of the palette.
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x='storey_range', y='resale_price',
            color=clrs[1], order=sorted(df['storey_range'].unique()),
            data=df, ax=ax)
# Rotate the existing tick labels in place (no need to re-set ticks from strings).
ax.tick_params(axis='x', labelrotation=55)
ax.set_title('Resale price against storey range')
ax.set_xlabel('Storey range')
ax.set_ylabel('Resale price')
plt.tight_layout()

# Resale price per resale age, one box for each distinct age value.
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x='resale_age', y='resale_price',
            color='pink', order=sorted(df['resale_age'].unique()),
            data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=55)
ax.set_title('Resale price against age')
ax.set_xlabel('Resale age')
ax.set_ylabel('Resale price')
plt.tight_layout()

# Different ways to look at the distribution of resale_age.
sns.histplot(x='resale_age', data=df)

sns.kdeplot(x='resale_age', data=df)

sns.histplot(x=df['resale_age'])  # passing the column directly instead of its name

# explicit bin edges: 0, 5, 10, ..., 95
sns.histplot(x='resale_age', data=df, bins=[x*5 for x in range(20)])

list(range(20))  # from zero to the integer before the number here (20)

list(range(5, 12))  # from the first number (inclusive, 5) to the integer before the last number here (12)

[x*5 for x in range(13)]  # list comprehension in python

# the same list built with an explicit loop, for comparison with the comprehension
a_list = []  # list()
for x in range(13):
    a_list.append(x*5)
print(a_list)

np.histogram(df['resale_age'])

np.histogram(df['resale_age'], bins=[x*5 for x in range(20)])

# --- Data transformation ---
# Distribution of the raw resale prices.
sns.histplot(x='resale_price', data=df, )

# Min-max scaling: (v - min) / (max - min) maps the values onto [0, 1].
vs = df['resale_price'].to_list()
vmin = min(vs)
vmax = max(vs)
vrange = vmax - vmin  # computed once instead of once per value
min_max_vs = [(v - vmin) / vrange for v in vs]
print(min(min_max_vs), max(min_max_vs))
sns.histplot(min_max_vs)
# Same scaling, then stretched and shifted onto [1, 100].
min_max_vs2 = [(v - vmin) / vrange * (100 - 1) + 1 for v in vs]
print(min(min_max_vs2), max(min_max_vs2))
sns.histplot(min_max_vs2)

# Standardisation: subtract the mean and divide by the standard deviation.
mu = np.mean(vs)
sig = np.std(vs)
std_vs = [(v - mu) / sig for v in vs]
print(min(std_vs), max(std_vs))
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(std_vs, ax=ax)
# Mark the mean (dotted) and one standard deviation either side (dashed).
std_mu = np.mean(std_vs)
std_sig = np.std(std_vs)
ax.axvline(x=std_mu, ls=':', c='k')
ax.axvline(x=std_mu + std_sig, ls='--', c='grey')
ax.axvline(x=std_mu - std_sig, ls='--', c='grey')
plt.tight_layout()

# Log transform (natural logarithm) of the raw prices.
vs_log = np.log(vs)
sns.histplot(vs_log)

# Standardise the log-transformed prices and mark mean +/- one standard deviation.
mu = np.mean(vs_log)
sig = np.std(vs_log)
std_log_vs = [(v - mu) / sig for v in vs_log]
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(std_log_vs, ax=ax, fc='xkcd:sky blue')
log_mu = np.mean(std_log_vs)
log_sig = np.std(std_log_vs)
ax.axvline(x=log_mu, ls='--', c='k', lw=2)
ax.axvline(x=log_mu + log_sig, ls='--', c='k')
ax.axvline(x=log_mu - log_sig, ls='--', c='k')
plt.tight_layout()

# --- Run kurtosis to check the shape ---
# Kurtosis of the raw prices, under both definitions.
kurtosis_fisher = kurtosis(vs, fisher=True)
kurtosis_pearson = kurtosis(vs, fisher=False)
kurtosis_fisher, kurtosis_pearson

# Kurtosis of the log-transformed prices.
kurtosis_fisher = kurtosis(vs_log, fisher=True)
kurtosis_pearson = kurtosis(vs_log, fisher=False)
kurtosis_fisher, kurtosis_pearson

# Kurtosis of the standardised log-transformed prices.
kurtosis_fisher = kurtosis(std_log_vs, fisher=True)
kurtosis_pearson = kurtosis(std_log_vs, fisher=False)
kurtosis_fisher, kurtosis_pearson

# Meaning of the `fisher` argument used above:
#   fisher: bool, optional
#       If True, Fisher’s definition is used (normal ==> 0.0).
#       If False, Pearson’s definition is used (normal ==> 3.0).