The aesthetics of a dataframe are not particularly important—unless they represent a final output that you intend to share with others. It is tempting to copy/paste into a spreadsheet and perform “last mile” visualization tweaks there. But this hurts reproducability—as now your code and output lives in separate places. A better approach is to do as much as possible in-notebook, using pd.DataFrame.style.
Note: You can ignore references to .pipe(hugo_safe_render) in code snippets below.
Setup
Imports
import numpy as np
import pandas as pd
from IPython.core.display import HTML
Package Version
--------- ---------
python 3.9.13
numpy 1.21.5
pandas 1.4.4
Create example dataframe
df_locations = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charles', 'Denise', 'Eric', 'Fiona'],
'age': np.random.randint(20, 80, size=6),
'origin': ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore'],
'location': ['New York', 'Chicago', 'Chicago', 'Singapore', 'Paris', 'Singapore']
}).rename_axis('user_id')
df_locations
| name | age | origin | location |
|---|
| user_id | | | | |
|---|
| 0 | Alice | 41 | New York | New York |
|---|
| 1 | Bob | 35 | Toronto | Chicago |
|---|
| 2 | Charles | 50 | Chicago | Chicago |
|---|
| 3 | Denise | 69 | Berlin | Singapore |
|---|
| 4 | Eric | 67 | Paris | Paris |
|---|
| 5 | Fiona | 54 | Singapore | Singapore |
|---|
General
Title and subtitle
def add_caption_to_df_styler(style_obj, title, subtitle=None):
""" """
caption = title
if subtitle:
caption += f'<div class="subtitle">{subtitle}</div>'
return style_obj.set_caption(caption).set_table_styles([{
'selector': 'caption',
'props': [
('font-family', 'Arial'),
('text-align', 'left'),
('font-size', '24px'),
('padding-bottom', '16px'),
]
}, {
'selector': 'div.subtitle',
'props': [('font-size', '14px'), ('padding-top', '6px')]
}
])
add_caption_to_df_styler(
df_locations.style,
title='Example title',
subtitle='This is an example DataFrame.'
).pipe(hugo_safe_render)
Example titleThis is an example DataFrame.
| | name | age | origin | location |
|---|
| user_id | | | | |
|---|
| 0 | Alice | 41 | New York | New York |
|---|
| 1 | Bob | 35 | Toronto | Chicago |
|---|
| 2 | Charles | 50 | Chicago | Chicago |
|---|
| 3 | Denise | 69 | Berlin | Singapore |
|---|
| 4 | Eric | 67 | Paris | Paris |
|---|
| 5 | Fiona | 54 | Singapore | Singapore |
|---|
Monospace font
df_locations.style.set_properties(**{'font-family': 'Menlo'}).pipe(hugo_safe_render)
| | name | age | origin | location |
|---|
| user_id | | | | |
|---|
| 0 | Alice | 41 | New York | New York |
|---|
| 1 | Bob | 35 | Toronto | Chicago |
|---|
| 2 | Charles | 50 | Chicago | Chicago |
|---|
| 3 | Denise | 69 | Berlin | Singapore |
|---|
| 4 | Eric | 67 | Paris | Paris |
|---|
| 5 | Fiona | 54 | Singapore | Singapore |
|---|
Background gradient
See also: Choosing Colormaps in Matplotlib — Matplotlib 3.5.1 documentation
For a single column
df_locations.style.background_gradient(subset=['age'], cmap='Reds').pipe(hugo_safe_render)
| | name | age | origin | location |
|---|
| user_id | | | | |
|---|
| 0 | Alice | 41 | New York | New York |
|---|
| 1 | Bob | 35 | Toronto | Chicago |
|---|
| 2 | Charles | 50 | Chicago | Chicago |
|---|
| 3 | Denise | 69 | Berlin | Singapore |
|---|
| 4 | Eric | 67 | Paris | Paris |
|---|
| 5 | Fiona | 54 | Singapore | Singapore |
|---|
If applying to multiple columns, the color map for each (min, max) will be separate.
df_multi_gradient = df_locations[['age']].assign(age_x2=lambda x: x['age']*2)
(
df_multi_gradient
.style
.background_gradient(subset=['age', 'age_x2'], cmap='Reds')
.pipe(hugo_safe_render)
)
| | age | age_x2 |
|---|
| user_id | | |
|---|
| 0 | 41 | 82 |
|---|
| 1 | 35 | 70 |
|---|
| 2 | 50 | 100 |
|---|
| 3 | 69 | 138 |
|---|
| 4 | 67 | 134 |
|---|
| 5 | 54 | 108 |
|---|
Get around this by using the vmin and vmax arguments.
(
df_multi_gradient
.style
.background_gradient(subset=['age', 'age_x2'], cmap='Reds', vmin=0, vmax=100)
.pipe(hugo_safe_render)
)
| | age | age_x2 |
|---|
| user_id | | |
|---|
| 0 | 41 | 82 |
|---|
| 1 | 35 | 70 |
|---|
| 2 | 50 | 100 |
|---|
| 3 | 69 | 138 |
|---|
| 4 | 67 | 134 |
|---|
| 5 | 54 | 108 |
|---|
Diverging colormaps
df_colormaps = pd.DataFrame({
'RdYlGn': np.arange(-1, 1.25, 0.25),
'PiYG': np.arange(-1, 1.25, 0.25),
})
display(
df_colormaps
.style
.background_gradient(subset=['RdYlGn'], cmap='RdYlGn', vmin=-1, vmax=1)
.background_gradient(subset=['PiYG'], cmap='PiYG', vmin=-1, vmax=1)
.format('{:.0%}')
.hide(axis='index')
)
| RdYlGn | PiYG |
|---|
| -100% | -100% |
| -75% | -75% |
| -50% | -50% |
| -25% | -25% |
| 0% | 0% |
| 25% | 25% |
| 50% | 50% |
| 75% | 75% |
| 100% | 100% |
Specific recipes
Bold specific cell values
from functools import partial
def bold_rows(vals: list, x) -> str:
""" Designed to be 'completed' using functools.partial,
then fed into pandas.DataFrame.style.applymap(func) """
if x in vals:
return 'font-weight:bold'
else:
return ''
SOME_VALUES = ['New York', 'Toronto']
bold_some_values = partial(bold_rows, SOME_VALUES)
df_locations.style.applymap(bold_some_values).pipe(hugo_safe_render)
| | name | age | origin | location |
|---|
| user_id | | | | |
|---|
| 0 | Alice | 41 | New York | New York |
|---|
| 1 | Bob | 35 | Toronto | Chicago |
|---|
| 2 | Charles | 50 | Chicago | Chicago |
|---|
| 3 | Denise | 69 | Berlin | Singapore |
|---|
| 4 | Eric | 67 | Paris | Paris |
|---|
| 5 | Fiona | 54 | Singapore | Singapore |
|---|
Highlight differences between two columns
from typing import List
def highlight_diff_styler(row: pd.Series, color='yellow') -> List[str]:
""" The function to use in df.style.apply() """
attr = 'background-color: {}'.format(color)
is_diff = row.iloc[0] != row.iloc[1]
if is_diff:
return [attr, attr]
else:
return ['', '']
df_locations.style.apply(highlight_diff_styler, axis=1, subset=['origin', 'location']).pipe(hugo_safe_render)
| | name | age | origin | location |
|---|
| user_id | | | | |
|---|
| 0 | Alice | 41 | New York | New York |
|---|
| 1 | Bob | 35 | Toronto | Chicago |
|---|
| 2 | Charles | 50 | Chicago | Chicago |
|---|
| 3 | Denise | 69 | Berlin | Singapore |
|---|
| 4 | Eric | 67 | Paris | Paris |
|---|
| 5 | Fiona | 54 | Singapore | Singapore |
|---|
Multi-line column names
Pandas does not support line breaks in column names, as this is generally considered an anti-pattern. But you can simulate them by creating a hierarchical index.
tuples = [
('Name', ''),
('Age', ''),
('Location', '(original)'),
('Location', '(current)')
]
idx = pd.MultiIndex.from_tuples(tuples)
to_viz = df_locations.copy()
to_viz.columns = idx
with pd.option_context('display.multi_sparse', False):
display(to_viz)
| Name | Age | Location | Location |
|---|
| | | (original) | (current) |
|---|
| user_id | | | | |
|---|
| 0 | Alice | 41 | New York | New York |
|---|
| 1 | Bob | 35 | Toronto | Chicago |
|---|
| 2 | Charles | 50 | Chicago | Chicago |
|---|
| 3 | Denise | 69 | Berlin | Singapore |
|---|
| 4 | Eric | 67 | Paris | Paris |
|---|
| 5 | Fiona | 54 | Singapore | Singapore |
|---|
If you’re planning to ultimately paste the dataframe as text, you can use the col_space argument of df.to_string() to manually resize the columns, so that it is easy to read.
print(to_viz.to_string(sparsify=False, col_space=[0, 5, 12, 12]))
Name Age Location Location
(original) (current)
user_id
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore
Table styles
Append a totals row
Note: If using pandas >= 1.5.0, a perhaps more elegant approach would be to use df.style.concat(df.agg(["sum"]).style).
def add_total_row(df: pd.DataFrame, name='Total') -> pd.DataFrame:
""" Append a sum row to a DataFrame. Useful when table is final dataviz. """
total_row = df.sum(numeric_only=True).rename(name).to_frame().T
return pd.concat([df, total_row])
df = pd.DataFrame({
'X': np.random.randint(1, 20, size=5),
'Y': np.random.randint(1, 50, size=5),
'Z': np.random.randint(1, 100, size=5)
}, index=['A', 'B', 'C', 'D', 'E'])
df.pipe(add_total_row).style.set_table_styles([{
'selector': 'tbody > tr:last-child',
'props': [('border-top', '2px solid #333;')]
}])
def format_helper(
df: pd.DataFrame,
int_cols: list = None,
pct_cols: list = None,
infer_cols: bool = True,
delta_cols: list= None,
monospace: bool = True,
hide_index: bool = True
) -> None:
""" Apply common formatting using pandas.DataFrame.style methods """
if isinstance(df, pd.core.frame.DataFrame):
output = df.style
elif isinstance(df, pd.io.formats.style.Styler):
output = df
else:
raise ValueError
if monospace:
output = output.set_properties(**{'font-family': 'Menlo'})
formats = {
'int_cols': '{:.0f}',
'pct_cols': '{:.2%}',
'delta_cols': '{:+.2%}'
}
if infer_cols and not int_cols and not pct_cols:
int_cols = [c for c in df.columns if (str.startswith(c, 'n_') or str.startswith(c, 'num_'))]
pct_cols = [c for c in df.columns if (str.startswith(c, 'p_') or str.startswith(c, 'pct_'))]
if int_cols:
output = output.format({col: formats['int_cols'] for col in int_cols})
if pct_cols:
output = output.format({col: formats['pct_cols'] for col in pct_cols})
if delta_cols:
output = output.format({col: formats['delta_cols'] for col in delta_cols})
if hide_index:
output = output.hide_index()
display(output)
Further reading