Geoff Ruddock

Visual styling a pandas DataFrame

The aesthetics of a dataframe are not particularly important—unless they represent a final output that you intend to share with others. It is tempting to copy/paste into a spreadsheet and perform “last mile” visualization tweaks there. But this hurts reproducability—as now your code and output lives in separate places. A better approach is to do as much as possible in-notebook, using pd.DataFrame.style.

Note: You can ignore references to .pipe(hugo_safe_render) in code snippets below.

Setup

Imports

import numpy as np
import pandas as pd

from IPython.core.display import HTML
Package    Version
---------  ---------
python     3.9.13
numpy      1.21.5
pandas     1.4.4

Create example dataframe

df_locations = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charles', 'Denise', 'Eric', 'Fiona'],
    'age': np.random.randint(20, 80, size=6),
    'origin': ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore'],
    'location': ['New York', 'Chicago', 'Chicago', 'Singapore', 'Paris', 'Singapore']
}).rename_axis('user_id')

df_locations

name age origin location
user_id
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore

General

Title and subtitle

def add_caption_to_df_styler(style_obj, title, subtitle=None):
    """ """
    caption = title
    if subtitle:
        caption += f'<div class="subtitle">{subtitle}</div>'

    return style_obj.set_caption(caption).set_table_styles([{
        'selector': 'caption',
        'props': [
            ('font-family', 'Arial'),
            ('text-align', 'left'),
            ('font-size', '24px'),
            ('padding-bottom', '16px'),
        ]
        }, {
            'selector': 'div.subtitle',
            'props': [('font-size', '14px'), ('padding-top', '6px')]
        }
    ])

add_caption_to_df_styler(
    df_locations.style,
    title='Example title',
    subtitle='This is an example DataFrame.'
).pipe(hugo_safe_render)
Example title
This is an example DataFrame.
  name age origin location
user_id        
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore

Monospace font

df_locations.style.set_properties(**{'font-family': 'Menlo'}).pipe(hugo_safe_render)
  name age origin location
user_id        
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore

Background gradient

See also: Choosing Colormaps in Matplotlib — Matplotlib 3.5.1 documentation

For a single column

df_locations.style.background_gradient(subset=['age'], cmap='Reds').pipe(hugo_safe_render)
  name age origin location
user_id        
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore

If applying to multiple columns, the color map for each (min, max) will be separate.

df_multi_gradient = df_locations[['age']].assign(age_x2=lambda x: x['age']*2)

(
    df_multi_gradient
    .style
    .background_gradient(subset=['age', 'age_x2'], cmap='Reds')
    .pipe(hugo_safe_render)
)
  age age_x2
user_id    
0 41 82
1 35 70
2 50 100
3 69 138
4 67 134
5 54 108

Get around this by using the vmin and vmax arguments.

(
    df_multi_gradient
    .style
    .background_gradient(subset=['age', 'age_x2'], cmap='Reds', vmin=0, vmax=100)
    .pipe(hugo_safe_render)
)
  age age_x2
user_id    
0 41 82
1 35 70
2 50 100
3 69 138
4 67 134
5 54 108

Diverging colormaps

df_colormaps = pd.DataFrame({
    'RdYlGn': np.arange(-1, 1.25, 0.25),
    'PiYG': np.arange(-1, 1.25, 0.25),
})

display(
    df_colormaps
    .style
    .background_gradient(subset=['RdYlGn'], cmap='RdYlGn', vmin=-1, vmax=1)
    .background_gradient(subset=['PiYG'], cmap='PiYG', vmin=-1, vmax=1)
    .format('{:.0%}')
    .hide(axis='index')
)
RdYlGn PiYG
-100% -100%
-75% -75%
-50% -50%
-25% -25%
0% 0%
25% 25%
50% 50%
75% 75%
100% 100%

Specific recipes

Bold specific cell values

from functools import partial

def bold_rows(vals: list, x) -> str:
    """ Designed to be 'completed' using functools.partial,
    then fed into pandas.DataFrame.style.applymap(func) """
    
    if x in vals:
        return 'font-weight:bold'
    else:
        return ''

SOME_VALUES = ['New York', 'Toronto']

bold_some_values = partial(bold_rows, SOME_VALUES)

df_locations.style.applymap(bold_some_values).pipe(hugo_safe_render)
  name age origin location
user_id        
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore

Highlight differences between two columns

from typing import List

def highlight_diff_styler(row: pd.Series, color='yellow') -> List[str]:
    """ The function to use in df.style.apply() """

    attr = 'background-color: {}'.format(color)
    is_diff = row.iloc[0] != row.iloc[1]
    if is_diff:
        return [attr, attr]
    else:
        return ['', '']
    

df_locations.style.apply(highlight_diff_styler, axis=1, subset=['origin', 'location']).pipe(hugo_safe_render)
  name age origin location
user_id        
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore

Multi-line column names

Pandas does not support line breaks in column names, as this is generally considered an anti-pattern. But you can simulate them by creating a hierarchical index.

tuples = [
    ('Name', ''),
    ('Age', ''),
    ('Location', '(original)'),
    ('Location', '(current)')
]

idx = pd.MultiIndex.from_tuples(tuples)
to_viz = df_locations.copy()
to_viz.columns = idx

with pd.option_context('display.multi_sparse', False):
    display(to_viz)

Name Age Location Location
(original) (current)
user_id
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore

If you’re planning to ultimately paste the dataframe as text, you can use the col_space argument of df.to_string() to manually resize the columns, so that it is easy to read.

print(to_viz.to_string(sparsify=False, col_space=[0, 5, 12, 12]))
            Name   Age     Location     Location
                         (original)    (current)
user_id                                         
0          Alice    41     New York     New York
1            Bob    35      Toronto      Chicago
2        Charles    50      Chicago      Chicago
3         Denise    69       Berlin    Singapore
4           Eric    67        Paris        Paris
5          Fiona    54    Singapore    Singapore

Table styles

Append a totals row

Note: If using pandas >= 1.5.0, a perhaps more elegant approach would be to use df.style.concat(df.agg(["sum"]).style).

def add_total_row(df: pd.DataFrame, name='Total') -> pd.DataFrame:
    """ Append a sum row to a DataFrame. Useful when table is final dataviz. """
    total_row = df.sum(numeric_only=True).rename(name).to_frame().T
    return pd.concat([df, total_row])


df = pd.DataFrame({
    'X': np.random.randint(1, 20, size=5),
    'Y': np.random.randint(1, 50, size=5),
    'Z': np.random.randint(1, 100, size=5)
}, index=['A', 'B', 'C', 'D', 'E'])

df.pipe(add_total_row).style.set_table_styles([{
    'selector': 'tbody > tr:last-child',
    'props': [('border-top', '2px solid #333;')]
}])

Single boilerplate format_helper function

def format_helper(
        df: pd.DataFrame,
        int_cols: list = None,
        pct_cols: list = None,
        infer_cols: bool = True,
        delta_cols: list= None,
        monospace: bool = True,
        hide_index: bool = True
    ) -> None:
    """ Apply common formatting using pandas.DataFrame.style methods """
    
    if isinstance(df, pd.core.frame.DataFrame):
        output = df.style
    elif isinstance(df, pd.io.formats.style.Styler):
        output = df
    else:
        raise ValueError
    
    if monospace:
        output = output.set_properties(**{'font-family': 'Menlo'})
    
    formats = {
        'int_cols': '{:.0f}',
        'pct_cols': '{:.2%}',
        'delta_cols': '{:+.2%}'
    }
    
    if infer_cols and not int_cols and not pct_cols:
        int_cols = [c for c in df.columns if (str.startswith(c, 'n_') or str.startswith(c, 'num_'))]
        pct_cols = [c for c in df.columns if (str.startswith(c, 'p_') or str.startswith(c, 'pct_'))]
        
    if int_cols:
        output = output.format({col: formats['int_cols'] for col in int_cols})
        
    if pct_cols:
        output = output.format({col: formats['pct_cols'] for col in pct_cols})
        
    if delta_cols:
        output = output.format({col: formats['delta_cols'] for col in delta_cols})
        
    if hide_index:
        output = output.hide_index()
        
    display(output)

Further reading


comments powered by Disqus