The aesthetics of a dataframe are not particularly important—unless they represent a final output that you intend to share with others. It is tempting to copy/paste into a spreadsheet and perform “last mile” visualization tweaks there. But this hurts reproducability—as now your code and output lives in separate places. A better approach is to do as much as possible in-notebook, using pd.DataFrame.style.
Note: You can ignore references to .pipe(hugo_safe_render)
in code snippets below.
Setup
Imports
import numpy as np
import pandas as pd
from IPython.core.display import HTML
Package Version
--------- ---------
python 3.9.13
numpy 1.21.5
pandas 1.4.4
Create example dataframe
df_locations = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charles', 'Denise', 'Eric', 'Fiona'],
'age': np.random.randint(20, 80, size=6),
'origin': ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore'],
'location': ['New York', 'Chicago', 'Chicago', 'Singapore', 'Paris', 'Singapore']
}).rename_axis('user_id')
df_locations
|
name |
age |
origin |
location |
user_id |
|
|
|
|
0 |
Alice |
41 |
New York |
New York |
1 |
Bob |
35 |
Toronto |
Chicago |
2 |
Charles |
50 |
Chicago |
Chicago |
3 |
Denise |
69 |
Berlin |
Singapore |
4 |
Eric |
67 |
Paris |
Paris |
5 |
Fiona |
54 |
Singapore |
Singapore |
General
Title and subtitle
def add_caption_to_df_styler(style_obj, title, subtitle=None):
""" """
caption = title
if subtitle:
caption += f'<div class="subtitle">{subtitle}</div>'
return style_obj.set_caption(caption).set_table_styles([{
'selector': 'caption',
'props': [
('font-family', 'Arial'),
('text-align', 'left'),
('font-size', '24px'),
('padding-bottom', '16px'),
]
}, {
'selector': 'div.subtitle',
'props': [('font-size', '14px'), ('padding-top', '6px')]
}
])
add_caption_to_df_styler(
df_locations.style,
title='Example title',
subtitle='This is an example DataFrame.'
).pipe(hugo_safe_render)
Example titleThis is an example DataFrame.
| name | age | origin | location |
user_id | | | | |
0 | Alice | 41 | New York | New York |
1 | Bob | 35 | Toronto | Chicago |
2 | Charles | 50 | Chicago | Chicago |
3 | Denise | 69 | Berlin | Singapore |
4 | Eric | 67 | Paris | Paris |
5 | Fiona | 54 | Singapore | Singapore |
Monospace font
df_locations.style.set_properties(**{'font-family': 'Menlo'}).pipe(hugo_safe_render)
| name | age | origin | location |
user_id | | | | |
0 | Alice | 41 | New York | New York |
1 | Bob | 35 | Toronto | Chicago |
2 | Charles | 50 | Chicago | Chicago |
3 | Denise | 69 | Berlin | Singapore |
4 | Eric | 67 | Paris | Paris |
5 | Fiona | 54 | Singapore | Singapore |
Background gradient
See also: Choosing Colormaps in Matplotlib — Matplotlib 3.5.1 documentation
For a single column
df_locations.style.background_gradient(subset=['age'], cmap='Reds').pipe(hugo_safe_render)
| name | age | origin | location |
user_id | | | | |
0 | Alice | 41 | New York | New York |
1 | Bob | 35 | Toronto | Chicago |
2 | Charles | 50 | Chicago | Chicago |
3 | Denise | 69 | Berlin | Singapore |
4 | Eric | 67 | Paris | Paris |
5 | Fiona | 54 | Singapore | Singapore |
If applying to multiple columns, the color map for each (min, max) will be separate.
df_multi_gradient = df_locations[['age']].assign(age_x2=lambda x: x['age']*2)
(
df_multi_gradient
.style
.background_gradient(subset=['age', 'age_x2'], cmap='Reds')
.pipe(hugo_safe_render)
)
| age | age_x2 |
user_id | | |
0 | 41 | 82 |
1 | 35 | 70 |
2 | 50 | 100 |
3 | 69 | 138 |
4 | 67 | 134 |
5 | 54 | 108 |
Get around this by using the vmin
and vmax
arguments.
(
df_multi_gradient
.style
.background_gradient(subset=['age', 'age_x2'], cmap='Reds', vmin=0, vmax=100)
.pipe(hugo_safe_render)
)
| age | age_x2 |
user_id | | |
0 | 41 | 82 |
1 | 35 | 70 |
2 | 50 | 100 |
3 | 69 | 138 |
4 | 67 | 134 |
5 | 54 | 108 |
Diverging colormaps
df_colormaps = pd.DataFrame({
'RdYlGn': np.arange(-1, 1.25, 0.25),
'PiYG': np.arange(-1, 1.25, 0.25),
})
display(
df_colormaps
.style
.background_gradient(subset=['RdYlGn'], cmap='RdYlGn', vmin=-1, vmax=1)
.background_gradient(subset=['PiYG'], cmap='PiYG', vmin=-1, vmax=1)
.format('{:.0%}')
.hide(axis='index')
)
RdYlGn |
PiYG |
-100% |
-100% |
-75% |
-75% |
-50% |
-50% |
-25% |
-25% |
0% |
0% |
25% |
25% |
50% |
50% |
75% |
75% |
100% |
100% |
Specific recipes
Bold specific cell values
from functools import partial
def bold_rows(vals: list, x) -> str:
""" Designed to be 'completed' using functools.partial,
then fed into pandas.DataFrame.style.applymap(func) """
if x in vals:
return 'font-weight:bold'
else:
return ''
SOME_VALUES = ['New York', 'Toronto']
bold_some_values = partial(bold_rows, SOME_VALUES)
df_locations.style.applymap(bold_some_values).pipe(hugo_safe_render)
| name | age | origin | location |
user_id | | | | |
0 | Alice | 41 | New York | New York |
1 | Bob | 35 | Toronto | Chicago |
2 | Charles | 50 | Chicago | Chicago |
3 | Denise | 69 | Berlin | Singapore |
4 | Eric | 67 | Paris | Paris |
5 | Fiona | 54 | Singapore | Singapore |
Highlight differences between two columns
from typing import List
def highlight_diff_styler(row: pd.Series, color='yellow') -> List[str]:
""" The function to use in df.style.apply() """
attr = 'background-color: {}'.format(color)
is_diff = row.iloc[0] != row.iloc[1]
if is_diff:
return [attr, attr]
else:
return ['', '']
df_locations.style.apply(highlight_diff_styler, axis=1, subset=['origin', 'location']).pipe(hugo_safe_render)
| name | age | origin | location |
user_id | | | | |
0 | Alice | 41 | New York | New York |
1 | Bob | 35 | Toronto | Chicago |
2 | Charles | 50 | Chicago | Chicago |
3 | Denise | 69 | Berlin | Singapore |
4 | Eric | 67 | Paris | Paris |
5 | Fiona | 54 | Singapore | Singapore |
Multi-line column names
Pandas does not support line breaks in column names, as this is generally considered an anti-pattern. But you can simulate them by creating a hierarchical index.
tuples = [
('Name', ''),
('Age', ''),
('Location', '(original)'),
('Location', '(current)')
]
idx = pd.MultiIndex.from_tuples(tuples)
to_viz = df_locations.copy()
to_viz.columns = idx
with pd.option_context('display.multi_sparse', False):
display(to_viz)
|
Name |
Age |
Location |
Location |
|
|
|
(original) |
(current) |
user_id |
|
|
|
|
0 |
Alice |
41 |
New York |
New York |
1 |
Bob |
35 |
Toronto |
Chicago |
2 |
Charles |
50 |
Chicago |
Chicago |
3 |
Denise |
69 |
Berlin |
Singapore |
4 |
Eric |
67 |
Paris |
Paris |
5 |
Fiona |
54 |
Singapore |
Singapore |
If you’re planning to ultimately paste the dataframe as text, you can use the col_space
argument of df.to_string()
to manually resize the columns, so that it is easy to read.
print(to_viz.to_string(sparsify=False, col_space=[0, 5, 12, 12]))
Name Age Location Location
(original) (current)
user_id
0 Alice 41 New York New York
1 Bob 35 Toronto Chicago
2 Charles 50 Chicago Chicago
3 Denise 69 Berlin Singapore
4 Eric 67 Paris Paris
5 Fiona 54 Singapore Singapore
Table styles
Append a totals row
Note: If using pandas >= 1.5.0, a perhaps more elegant approach would be to use df.style.concat(df.agg(["sum"]).style)
.
def add_total_row(df: pd.DataFrame, name='Total') -> pd.DataFrame:
""" Append a sum row to a DataFrame. Useful when table is final dataviz. """
total_row = df.sum(numeric_only=True).rename(name).to_frame().T
return pd.concat([df, total_row])
df = pd.DataFrame({
'X': np.random.randint(1, 20, size=5),
'Y': np.random.randint(1, 50, size=5),
'Z': np.random.randint(1, 100, size=5)
}, index=['A', 'B', 'C', 'D', 'E'])
df.pipe(add_total_row).style.set_table_styles([{
'selector': 'tbody > tr:last-child',
'props': [('border-top', '2px solid #333;')]
}])
def format_helper(
df: pd.DataFrame,
int_cols: list = None,
pct_cols: list = None,
infer_cols: bool = True,
delta_cols: list= None,
monospace: bool = True,
hide_index: bool = True
) -> None:
""" Apply common formatting using pandas.DataFrame.style methods """
if isinstance(df, pd.core.frame.DataFrame):
output = df.style
elif isinstance(df, pd.io.formats.style.Styler):
output = df
else:
raise ValueError
if monospace:
output = output.set_properties(**{'font-family': 'Menlo'})
formats = {
'int_cols': '{:.0f}',
'pct_cols': '{:.2%}',
'delta_cols': '{:+.2%}'
}
if infer_cols and not int_cols and not pct_cols:
int_cols = [c for c in df.columns if (str.startswith(c, 'n_') or str.startswith(c, 'num_'))]
pct_cols = [c for c in df.columns if (str.startswith(c, 'p_') or str.startswith(c, 'pct_'))]
if int_cols:
output = output.format({col: formats['int_cols'] for col in int_cols})
if pct_cols:
output = output.format({col: formats['pct_cols'] for col in pct_cols})
if delta_cols:
output = output.format({col: formats['delta_cols'] for col in delta_cols})
if hide_index:
output = output.hide_index()
display(output)
Further reading
comments powered by Disqus