Skip to article frontmatterSkip to article content

Use mapclassify to create class breaks and group values (for choropleth maps)

import libraries and functions

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mapclassify as mc

Basic path joining and loading tabular data

# Folder that holds the dataset; change this to the location on your machine.
data_dir = '../datasets/data_dirs/'
os.listdir(data_dir)

# Name of the compressed CSV file to be read.
fname = 'hdb_data_2023.csv.xz'

# Combine folder and filename into one path, then print it as a sanity check.
fp = os.path.join(data_dir, fname)
print(fp)

# Load the table, using the first CSV column as the row index.
df = pd.read_csv(fp, index_col=0)
df.head()  # first rows of the table (n=5 by default)
# Resale prices as a plain Python list, plus a quick look at their distribution.
vs = df['resale_price'].to_list()
sns.histplot(data=df, x='resale_price')

# Fit mapclassify's EqualInterval classifier with its default settings.
breaks = mc.EqualInterval(vs)
breaks       # the fitted classifier
breaks.yb    # class label assigned to each value
breaks.bins  # break values separating the classes

# np.histogram turns N edges into N - 1 bins, so passing the break values
# alone leaves out every value below the first break.
np.histogram(vs, bins=list(breaks.bins))
# Prepend a lower edge so the first class is counted as well.
# NOTE(review): 150000 is hard-coded, presumably at or below min(vs) - confirm.
np.histogram(vs, bins=[150000, *breaks.bins])
len(vs)
# NOTE(review): these literals look like per-class counts pasted from earlier
# outputs, summed to compare both totals against len(vs) - confirm.
sum([4967, 15233, 4806, 673, 81]), sum([5152, 15182, 4713, 634, 79])
# EqualInterval classification: overlay its break values on the price histogram.
breaks = mc.EqualInterval(vs)

fig, ax = plt.subplots(figsize=(6, 4))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.histplot(vs, ax=ax)

# One dashed black line at the minimum value, then one at every break value.
for cut in [min(vs), *breaks.bins]:
    ax.axvline(x=cut, c='k', ls='--')

fig.tight_layout()
# Quantiles classification: overlay its break values on the price histogram.
breaks = mc.Quantiles(vs)

fig, ax = plt.subplots(figsize=(6, 4))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.histplot(vs, ax=ax)

# One dashed black line at the minimum value, then one at every break value.
for cut in [min(vs), *breaks.bins]:
    ax.axvline(x=cut, c='k', ls='--')

fig.tight_layout()
# NaturalBreaks classification: overlay its break values on the price histogram.
breaks = mc.NaturalBreaks(vs)

fig, ax = plt.subplots(figsize=(6, 4))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.histplot(vs, ax=ax)

# One dashed black line at the minimum value, then one at every break value.
for cut in [min(vs), *breaks.bins]:
    ax.axvline(x=cut, c='k', ls='--')

fig.tight_layout()
# StdMean classification: overlay its break values on the price histogram.
breaks = mc.StdMean(vs)

fig, ax = plt.subplots(figsize=(6, 4))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.histplot(vs, ax=ax)

# One dashed black line at the minimum value, then one at every break value.
for cut in [min(vs), *breaks.bins]:
    ax.axvline(x=cut, c='k', ls='--')

fig.tight_layout()
# HeadTailBreaks classification: overlay its break values on the price
# histogram. `breaks` is read again afterwards to label the rows of `df`.
breaks = mc.HeadTailBreaks(vs)

fig, ax = plt.subplots(figsize=(6, 4))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.histplot(vs, ax=ax)

# One dashed black line at the minimum value, then one at every break value.
for cut in [min(vs), *breaks.bins]:
    ax.axvline(x=cut, c='k', ls='--')

fig.tight_layout()
# Store the class label of every row (from the most recently fitted
# classifier in `breaks`) as a new column.
df['htbreaks_yb'] = breaks.yb
df.head()

# Print "<class label> <row count>" for each class in a single pass.
# sort=False keeps the classes in order of first appearance, which is the
# order Series.unique() would return them in.
for g, n_rows in df.groupby('htbreaks_yb', sort=False).size().items():
    print(g, n_rows)

# Distribution of resale prices within each class.
sns.boxplot(x='htbreaks_yb', y='resale_price', data=df)