Goal: Create a visual that makes it easy to digest and compare segmented percentages (binomial proportions) while accounting for the uncertainty that comes with small sample sizes.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
Package Version
----------- ---------
python 3.9.12
numpy 1.21.5
pandas 1.4.2
matplotlib 3.5.1
statsmodels 0.13.2
Example data
# Fixed seed so the randomly drawn success counts are reproducible.
np.random.seed(24)

# Build the (dimension, metric) grid directly as a MultiIndex -- metric varies
# fastest, so the 12 random draws land on the same pairs as a column-wise
# construction would -- then flip the levels to (metric, dimension) and sort.
example_data = pd.DataFrame(
    {
        'k': np.random.randint(25, 75, size=12),  # successes per segment
        'n': 100,                                 # trials per segment
    },
    index=pd.MultiIndex.from_product(
        [
            ['Group A', 'Group B', 'Group C'],
            ['Received rate', 'Open rate', 'Click rate', 'Purchase rate'],
        ],
        names=['dimension', 'metric'],
    ),
).swaplevel().sort_index()
example_data
| metric | dimension | k | n |
|---|---|---|---|
| Click rate | Group A | 25 | 100 |
| Click rate | Group B | 26 | 100 |
| Click rate | Group C | 72 | 100 |
| Open rate | Group A | 28 | 100 |
| Open rate | Group B | 42 | 100 |
| Open rate | Group C | 29 | 100 |
| Purchase rate | Group A | 48 | 100 |
| Purchase rate | Group B | 61 | 100 |
| Purchase rate | Group C | 60 | 100 |
| Received rate | Group A | 59 | 100 |
| Received rate | Group B | 42 | 100 |
| Received rate | Group C | 40 | 100 |
Single plot
import matplotlib.ticker as mticker
def plot_pct_with_conf_ints(df: pd.DataFrame, alpha=0.05, ax=None, method='normal'):
    """Plot binomial proportions (percentages) with confidence-interval error bars.

    Each row of ``df`` becomes one horizontal entry: a diamond marker at
    ``k / n`` plus an error bar spanning the confidence interval computed by
    ``sm.stats.proportion_confint``.

    Parameters
    ----------
    df : pd.DataFrame
        Must have exactly the columns ``['k', 'n']`` (in that order). The
        index values are used as the y-axis tick labels.
    alpha : float, default 0.05
        Significance level forwarded to ``proportion_confint``.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new figure is created whose height
        grows with the number of rows.
    method : str, default 'normal'
        Interval method forwarded to ``proportion_confint``. The default
        keeps the normal approximation; other methods supported by
        statsmodels (e.g. ``'wilson'``) can be selected for small samples.

    Returns
    -------
    The Axes that was drawn on.

    Notes
    -----
    Every call resets the global matplotlib style via ``plt.style.use``.
    """
    assert df.columns.tolist() == ['k', 'n'], "df must have exactly the columns ['k', 'n']"

    # Reverse the rows so the top-to-bottom order on the plot matches the
    # input order (y positions grow upward).
    df = df.iloc[::-1]
    n_rows = df.shape[0]

    #################
    # CONFIGURE PLOT
    #################
    # NOTE: these calls mutate global matplotlib state on every invocation.
    plt.style.use('classic')
    plt.style.use('default')
    if ax is None:
        _, ax = plt.subplots(figsize=(8, n_rows * 0.4), dpi=100)

    ############
    # PLOT DATA
    ############
    proportions = (df['k'] / df['n']).values
    y = np.arange(n_rows)
    ax.scatter(proportions, y, marker='D')

    #################
    # PLOT INTERVALS
    #################
    ci_low, ci_upp = sm.stats.proportion_confint(
        df['k'].values, df['n'].values, alpha=alpha, method=method
    )
    # errorbar expects distances from the point, not absolute bounds:
    # row 0 is the distance to the lower bound, row 1 to the upper bound.
    xerr = np.abs(np.vstack([ci_low, ci_upp]) - proportions)
    ax.errorbar(
        x=proportions, y=y,  # position of points
        xerr=xerr,           # size of error bar on each side
        capsize=5,           # wing tips
        ls='none',           # prevent connecting points
    )

    ##################
    # AXIS FORMATTING
    ##################
    ax.set_xlim(0, 1.01)
    ax.set_ylim(-1, n_rows)
    ax.xaxis.grid(True, alpha=0.4)
    ax.set_yticks(y)
    ax.set_yticklabels(df.index.values)
    # Format x-ticks as whole percentages (axis values are fractions of 1).
    ax.xaxis.set_major_formatter(mticker.PercentFormatter(xmax=1, decimals=0))
    return ax
# Render a single metric on its own: .loc['Open rate'] selects that metric's
# rows, leaving 'dimension' as the index that labels the y-axis.
_ = plot_pct_with_conf_ints(example_data.loc['Open rate'])
Combined plot
def combined_plot(df: pd.DataFrame, title:str = None, hspace: float = 0.5):
    """Combine multiple binomial plots into a single matplotlib figure.

    One subplot is drawn per distinct value of the first index level, stacked
    vertically, with each subplot's height proportional to its row count.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a MultiIndex; the first level selects the subplot and the
        remaining level(s) label the rows within it. Columns must be what
        ``plot_pct_with_conf_ints`` accepts (``['k', 'n']``).
    title : str, optional
        Figure-level title. Omitted when ``None``.
    hspace : float, default 0.5
        Vertical spacing between subplots, forwarded to the grid spec.

    Raises
    ------
    ValueError
        If ``df`` contains no groups to plot.
    """
    # Rows per first-level group. Derived from the data actually present so
    # the subplot count and the height ratios always agree, even when the
    # index still carries unused levels after filtering.
    group_sizes = df.groupby(level=0).size()
    if group_sizes.empty:
        raise ValueError('df has no groups to plot')

    fig, axes = plt.subplots(
        nrows=len(group_sizes),
        ncols=1,
        squeeze=False,  # always a 2-D array, so a single group also works
        gridspec_kw={'height_ratios': group_sizes.tolist(), 'hspace': hspace},
        figsize=(8, 8), dpi=100,
    )
    for ax, group in zip(axes[:, 0], group_sizes.index):
        ax.set_title(group, size='medium')
        # .loc drops the first index level so only the inner labels remain.
        plot_pct_with_conf_ints(df.loc[group], ax=ax)

    if title is not None:
        fig.suptitle(title, y=0.94)
# Draw every metric of the example dataset, one subplot per metric.
combined_plot(example_data, title='Combined plot')
comments powered by Disqus