# Use mapclassify to create breaks and groups of values (for choropleth maps)
## Import libraries and functions
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mapclassify as mc
## Basic path joining and loading tabular data
# Directory that holds the dataset.
data_dir = '../datasets/data_dirs/'  # please change to the file location in your directories
# List the directory contents to check that the data file is there.
# As a bare expression, the result is only shown when this is the last
# statement of a notebook cell.
os.listdir(data_dir)

fname = 'hdb_data_2023.csv.xz'  # filename of the file to be read
fp = os.path.join(data_dir, fname)  # join the filename with the location of file
print(fp)  # print to check
# The first CSV column is used as the row index; pandas infers the xz
# compression from the file extension.
df = pd.read_csv(fp, index_col=0)
df.head()  # show the top n rows, n=5 by default

# Resale prices as a plain Python list; this is the input to every
# classifier below.
vs = df['resale_price'].to_list()

# Overall distribution of resale prices.
sns.histplot(x='resale_price', data=df)

# Equal-interval classification with mapclassify's default number of classes.
breaks = mc.EqualInterval(vs)
breaks  # notebook display of the classification summary

breaks.yb  # class index assigned to each observation

breaks.bins  # upper bound of each class

# Cross-check the class counts with numpy. `bins` holds only upper bounds,
# so here the first bound acts as the leftmost edge and observations below
# it are not counted.
np.histogram(vs, bins=list(breaks.bins))

# Same check with an explicit lower edge prepended so the first class is
# counted too. NOTE(review): 150000 is hard-coded, presumably at or below
# min(vs) -- confirm against the data.
np.histogram(vs, bins=[150000] + list(breaks.bins))

len(vs)  # total number of observations

# Hard-coded class counts, presumably copied from the outputs above;
# both lists add up to 25760.
sum([4967, 15233, 4806, 673, 81]), sum([5152, 15182, 4713, 634, 79])

# Recompute the equal-interval classification for the plot that follows.
breaks = mc.EqualInterval(vs)
# Histogram of resale prices with the current class boundaries in `breaks`
# overlaid as dashed vertical lines.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(vs, ax=ax)
# `breaks.bins` holds only the upper bound of each class, so the lower
# bound of the first class (the minimum value) is drawn separately.
ax.axvline(x=min(vs), c='k', ls='--')
for cut in breaks.bins:
    ax.axvline(x=cut, c='k', ls='--')
plt.tight_layout()

# Switch to a quantile classification for the next plot.
breaks = mc.Quantiles(vs)
# Histogram of resale prices with the current class boundaries in `breaks`
# overlaid as dashed vertical lines.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(vs, ax=ax)
# `breaks.bins` holds only the upper bound of each class, so the lower
# bound of the first class (the minimum value) is drawn separately.
ax.axvline(x=min(vs), c='k', ls='--')
for cut in breaks.bins:
    ax.axvline(x=cut, c='k', ls='--')
plt.tight_layout()

# Switch to a natural-breaks classification for the next plot.
breaks = mc.NaturalBreaks(vs)
# Histogram of resale prices with the current class boundaries in `breaks`
# overlaid as dashed vertical lines.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(vs, ax=ax)
# `breaks.bins` holds only the upper bound of each class, so the lower
# bound of the first class (the minimum value) is drawn separately.
ax.axvline(x=min(vs), c='k', ls='--')
for cut in breaks.bins:
    ax.axvline(x=cut, c='k', ls='--')
plt.tight_layout()

# Switch to a standard-deviation-around-the-mean classification for the
# next plot.
breaks = mc.StdMean(vs)
# Histogram of resale prices with the current class boundaries in `breaks`
# overlaid as dashed vertical lines.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(vs, ax=ax)
# `breaks.bins` holds only the upper bound of each class, so the lower
# bound of the first class (the minimum value) is drawn separately.
ax.axvline(x=min(vs), c='k', ls='--')
for cut in breaks.bins:
    ax.axvline(x=cut, c='k', ls='--')
plt.tight_layout()

# Switch to a head/tail-breaks classification for the next plot.
breaks = mc.HeadTailBreaks(vs)
# Histogram of resale prices with the current class boundaries in `breaks`
# overlaid as dashed vertical lines.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(vs, ax=ax)
# `breaks.bins` holds only the upper bound of each class, so the lower
# bound of the first class (the minimum value) is drawn separately.
ax.axvline(x=min(vs), c='k', ls='--')
for cut in breaks.bins:
    ax.axvline(x=cut, c='k', ls='--')
plt.tight_layout()

# Store each row's class index from the current classifier as a new column.
df['htbreaks_yb'] = breaks.yb

df.head()  # notebook display: confirm the new column is present

# Print the number of rows that fall into each class.
for g in df['htbreaks_yb'].unique():
    tmp = df[df['htbreaks_yb'] == g]
    print(g, len(tmp))

# Start a fresh figure so the box plot is not drawn over the histogram
# axes when these cells run back-to-back as a single script.
plt.figure()
# Distribution of resale prices within each class.
sns.boxplot(x='htbreaks_yb', y='resale_price', data=df)