Geoff Ruddock

Binomial proportions

Goal: Create a visual that makes it easy to digest and compare segmented percentages (binomial proportions) while accounting for the uncertainty that comes with small sample sizes.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
Package      Version
-----------  ---------
python       3.9.12
numpy        1.21.5
pandas       1.4.2
matplotlib   3.5.1
statsmodels  0.13.2

Example data

# Seed the global NumPy RNG so the randomly drawn success counts are reproducible.
np.random.seed(24)

metric_names = ['Received rate', 'Open rate', 'Click rate', 'Purchase rate']
group_names = ['Group A', 'Group B', 'Group C']

# One row per (group, metric) pair: k successes out of n = 100 trials.
raw_counts = pd.DataFrame({
    'metric': metric_names * len(group_names),
    'dimension': [group for group in group_names for _ in metric_names],
    'k': np.random.randint(25, 75, size=len(metric_names) * len(group_names)),
    'n': 100,
})

# Index by (metric, dimension) and sort so each metric's groups sit together.
example_data = raw_counts.set_index(['metric', 'dimension'])
example_data = example_data.sort_index()

example_data

metric         dimension    k    n
Click rate     Group A     25  100
               Group B     26  100
               Group C     72  100
Open rate      Group A     28  100
               Group B     42  100
               Group C     29  100
Purchase rate  Group A     48  100
               Group B     61  100
               Group C     60  100
Received rate  Group A     59  100
               Group B     42  100
               Group C     40  100

Single plot

import matplotlib.ticker as mticker

def plot_pct_with_conf_ints(df: pd.DataFrame, alpha=0.05, ax=None, method='normal'):
    """Plot binomial proportions (percentages) with confidence-interval error bars.

    Each row of ``df`` is drawn as one marker at ``k / n`` with a horizontal
    error bar spanning the confidence interval computed by statsmodels'
    ``proportion_confint``.

    Parameters
    ----------
    df : pd.DataFrame
        Must have exactly the columns ``['k', 'n']`` (in that order): the
        number of successes and the number of trials. The index values are
        used as the y-axis tick labels.
    alpha : float, default 0.05
        Significance level forwarded to ``proportion_confint``.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new figure is created whose height
        scales with the number of rows.
    method : str, default 'normal'
        Interval method forwarded to ``proportion_confint`` (for example
        ``'wilson'``). The default matches statsmodels' own default.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.

    Raises
    ------
    ValueError
        If the columns of ``df`` are not exactly ``['k', 'n']``.
    """

    # Explicit raise instead of `assert`, which is stripped when Python runs with -O.
    if df.columns.tolist() != ['k', 'n']:
        raise ValueError(
            "df must have exactly the columns ['k', 'n'], got {}".format(df.columns.tolist())
        )

    df = df[::-1]  # reverse so that vertical order matches input df
    n_rows = df.shape[0]

    #################
    # CONFIGURE PLOT
    #################

    # NOTE: these calls change matplotlib's global style for the whole session,
    # not just for this plot.
    plt.style.use('classic')
    plt.style.use('default')

    if ax is None:
        # 0.4 inches per row; keep a non-zero height even for an empty frame.
        _, ax = plt.subplots(figsize=(8, max(n_rows, 1) * 0.4), dpi=100)

    ############
    # PLOT DATA
    ############

    x = (df['k'] / df['n']).values
    y = np.arange(n_rows)
    ax.scatter(x, y, marker='D')

    #################
    # PLOT INTERVALS
    #################

    lower, upper = sm.stats.proportion_confint(
        df['k'].values, df['n'].values, alpha=alpha, method=method
    )
    # errorbar expects distances from the point (row 0: to the left, row 1: to
    # the right), not absolute interval bounds.
    rel_conf_ints = np.abs(np.vstack([lower, upper]) - x)
    ax.errorbar(
        x=x, y=y,             # position of points
        xerr=rel_conf_ints,   # size (±) of error bar
        capsize=5,            # wing tips
        ls='none'             # prevent connecting points
    )

    ##################
    # AXIS FORMATTING
    ##################

    ax.set_xlim(0, 1.01)
    ax.set_ylim(-1, n_rows)

    ax.xaxis.grid(True, alpha=0.4)

    ax.set_yticks(y)
    ax.set_yticklabels(df.index.values)

    # Format x-ticks as whole percentages (0.2 -> "20%"); a formatter keeps the
    # labels correct even if the axis limits change later.
    ax.xaxis.set_major_formatter(mticker.PercentFormatter(xmax=1, decimals=0))

    return ax


# Plot a single metric: .loc['Open rate'] selects that metric's rows, leaving
# one row per group (indexed by 'dimension') with the 'k' and 'n' columns.
_ = plot_pct_with_conf_ints(example_data.loc['Open rate'])

png

Combined plot

def combined_plot(df: pd.DataFrame, title: str = None, hspace: float = 0.5):
    """Combine multiple binomial plots into a single matplotlib figure.

    One subplot is drawn per distinct value of the first index level, stacked
    vertically, with each subplot's height proportional to its number of rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a MultiIndex whose first level identifies the subplot;
        each per-group slice is passed to ``plot_pct_with_conf_ints``, so the
        columns must be ``['k', 'n']``.
    title : str, optional
        Figure-level title. No title is drawn when omitted.
    hspace : float, default 0.5
        Vertical spacing between subplots, passed to the grid spec.
    """

    # Derive both the subplot count and the height ratios from the groups that
    # are actually present: `index.levels` can still list values that were
    # filtered out of the frame, which would make the two disagree.
    group_sizes = df.groupby(level=0).size()

    fig, axes = plt.subplots(
        nrows=len(group_sizes),
        ncols=1,
        squeeze=False,  # always a 2-D array of axes, even for a single group
        gridspec_kw={'height_ratios': group_sizes.tolist(), 'hspace': hspace},
        figsize=(8, 8), dpi=100
    )

    for ax, group_name in zip(axes[:, 0], group_sizes.index):
        ax.set_title(group_name, size='medium')
        # .loc drops the first index level so rows are labelled by the rest.
        plot_pct_with_conf_ints(df.loc[group_name], ax=ax)

    if title is not None:
        fig.suptitle(title, y=0.94)
    
# Draw every metric in the example data as its own subplot in one figure.
combined_plot(example_data, title='Combined plot')

png