Geoff Ruddock

Exploratory data analysis (EDA)

import numpy as np
import pandas as pd
Package    Version
---------  ---------
python     3.9.12
numpy      1.21.5
pandas     1.4.2

General

Context manager

If you are working with very long or wide dataframes, it makes sense to set your pandas options such as max_rows and max_columns so that the output from a cell is not miles long.
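Setting these options globally might look something like this (the specific limits here are arbitrary, not a recommendation):

```python
import pandas as pd

# Raise the display limits for the whole session
# (pick values that suit your screen).
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)
```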

If you need to temporarily override these settings, you can use the pd.option_context context manager to do so.

df = pd.DataFrame([np.random.randint(1, 100, 100)])
df

[1 rows × 100 columns, values truncated to pandas' default display width]

with pd.option_context('display.max_rows', 999, 'display.max_colwidth', 100):
    display(df)

[1 rows × 100 columns, values truncated for display]

Value counts

By default, srs.value_counts() sorts by frequency descending. If you want the counts ordered by index instead, chain on .sort_index().

srs = pd.Series(np.random.choice(list('ABCDEF'), size=100))

srs.value_counts()
D    19
C    18
B    17
A    16
F    15
E    15
dtype: int64

srs.value_counts().sort_index()
A    16
B    17
C    18
D    19
E    15
F    15
dtype: int64
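Relatedly, value_counts(normalize=True) returns relative frequencies rather than raw counts, which is often what you actually want when eyeballing a distribution. The srs definition is repeated here so the snippet runs standalone:

```python
import numpy as np
import pandas as pd

srs = pd.Series(np.random.choice(list('ABCDEF'), size=100))

# Proportions instead of counts; the values sum to 1.0
srs.value_counts(normalize=True).sort_index()
```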

Frequency

df = pd.DataFrame({
    'dim_A': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C'],
    'dim_B': ['X', 'Y', 'Z', 'X', 'Y', 'Z', 'X', 'Y'],
    'num_': 1
})

df

  dim_A dim_B  num_
0     A     X     1
1     A     Y     1
2     A     Z     1
3     B     X     1
4     B     Y     1
5     B     Z     1
6     C     X     1
7     C     Y     1

unstack

You can technically do a simple frequency count using either df.groupby() or df.set_index() in combination with df.unstack(), but this becomes cumbersome for all but the most simple of use-cases.

(
    df
    .groupby(['dim_A', 'dim_B']) 
    ['num_'].count()
    .unstack()
    .fillna(0)
    .astype(int)
)

dim_B  X  Y  Z
dim_A
A      1  1  1
B      1  1  1
C      1  1  0

(
    df
    .set_index(['dim_A', 'dim_B'])
    .iloc[:, 0]
    .unstack()
    .fillna(0)
    .astype(int)
)

dim_B  X  Y  Z
dim_A
A      1  1  1
B      1  1  1
C      1  1  0

pivot

Using df.pivot() is, I suppose, a couple lines shorter than using df.unstack(), but it does not offer any of the additional options that pd.pivot_table() below does.
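A sketch of the df.pivot() version, reusing the same df as above (repeated here so the snippet runs standalone). Note that df.pivot() performs no aggregation, so it requires each (index, columns) pair to appear at most once:

```python
import pandas as pd

# Same df as above, repeated so this snippet is self-contained
df = pd.DataFrame({
    'dim_A': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C'],
    'dim_B': ['X', 'Y', 'Z', 'X', 'Y', 'Z', 'X', 'Y'],
    'num_': 1
})

# Reshape only -- no aggregation -- then fill the missing (C, Z) cell
(
    df
    .pivot(index='dim_A', columns='dim_B', values='num_')
    .fillna(0)
    .astype(int)
)
```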

pivot_table

This achieves the same result as above, but with just a single line.

df.pivot_table(index='dim_A', columns='dim_B', values='num_', fill_value=0)

dim_B  X  Y  Z
dim_A
A      1  1  1
B      1  1  1
C      1  1  0

And it also has a handful of parameters that come in useful for more complex use-cases.

df.pivot_table(
    index='dim_A',
    columns='dim_B',
    values='num_',
    aggfunc='sum',
    fill_value=0,
    margins=True
).astype(int)

dim_B  X  Y  Z  All
dim_A
A      1  1  1    3
B      1  1  1    3
C      1  1  0    2
All    3  3  2    8

crosstab

pd.crosstab is structurally similar to df.pivot_table, except:

- it is a top-level function that takes array-likes (e.g. Series) directly, rather than a method on an existing DataFrame,
- it counts frequencies by default, so values and aggfunc are optional, and
- it accepts a normalize parameter, which converts the counts into proportions.

pd.crosstab(
    index=df.dim_A,
    columns=df.dim_B,
    values=df.num_,
    aggfunc='sum',
    margins=True,
    normalize=True
)

dim_B      X      Y      Z    All
dim_A
A      0.125  0.125  0.125  0.375
B      0.125  0.125  0.125  0.375
C      0.125  0.125  0.000  0.250
All    0.375  0.375  0.250  1.000

Misc

Manually adding a totals row

def add_total_row(df: pd.DataFrame, name='Total') -> pd.DataFrame:
    """ Append a sum row to a DataFrame. Useful when table is final dataviz. """
    total_row = df.sum(numeric_only=True).rename(name).to_frame().T
    return pd.concat([df, total_row])


df = pd.DataFrame({
    'X': np.random.randint(1, 100, size=5),
    'Y': np.random.randint(1, 10, size=5)
}, index=['A', 'B', 'C', 'D', 'E'])

df.pipe(add_total_row)

         X   Y
A       28   1
B       66   8
C       90   2
D       28   6
E       40   3
Total  252  20

df.pipe(add_total_row).style.set_table_styles([{
    'selector': 'tbody > tr:last-child',
    'props': [('border-top', '2px solid #333;')]
}])
         X   Y
A       28   1
B       66   8
C       90   2
D       28   6
E       40   3
Total  252  20

Parsimonious display

Diff rows

df = pd.DataFrame({
    'A': [1, 1, 1],
    'B': [1, 2, 3],
    'C': [1, 1, 2],
    'D': [None, None, None]
})

df

   A  B  C     D
0  1  1  1  None
1  1  2  1  None
2  1  3  2  None

def diff_rows(df: pd.DataFrame) -> pd.DataFrame:
    """ Drop columns for which all values across rows are identical. """
    non_unique_cols = df.astype(str).nunique().loc[lambda x: x > 1].index
    return df[non_unique_cols]

df.pipe(diff_rows)

   B  C
0  1  1
1  2  1
2  3  2

# One-liner version

df.loc[:, lambda x: x.astype(str).nunique().loc[lambda y: y > 1].index]

   B  C
0  1  1
1  2  1
2  3  2

Drop always-null columns

def drop_all_null_cols(df: pd.DataFrame) -> pd.DataFrame:
    """ Drop columns in which every value is null, logging which were dropped. """
    null_cols = df.isnull().mean().loc[lambda x: x == 1].index.tolist()
    null_cols_str = ', '.join(null_cols)
    print(f'Dropping {len(null_cols)} columns that are always null: {null_cols_str}\n')
    return df.drop(null_cols, axis=1)

df.pipe(drop_all_null_cols)
Dropping 1 columns that are always null: D

   A  B  C
0  1  1  1
1  1  2  1
2  1  3  2
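If you don't need the log line, pandas has this built in via df.dropna(axis=1, how='all'). The df definition is repeated here so the snippet runs standalone:

```python
import pandas as pd

# Same df as above, repeated so this snippet is self-contained
df = pd.DataFrame({
    'A': [1, 1, 1],
    'B': [1, 2, 3],
    'C': [1, 1, 2],
    'D': [None, None, None]
})

# Drop columns in which every value is null
df.dropna(axis=1, how='all')
```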

Pairwise percentages

Suppose you have two dimensions (A, B) and are interested in the % of some numeric value for which the dimensions match.

df = pd.DataFrame({
    'dim_A': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'],
    'dim_B': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    'num_': np.arange(1, 10)
})

df

  dim_A dim_B  num_
0     A     A     1
1     A     B     2
2     A     C     3
3     B     A     4
4     B     B     5
5     B     C     6
6     C     A     7
7     C     B     8
8     C     C     9

A good order to handle this is: compute → compare (and filter) → aggregate.

This has the benefit of making the intermediate outputs useful to inspect/debug.

(
    df
    .assign(pct_=lambda x: x['num_'] / x['num_'].sum())
    .loc[lambda x: x['dim_A'] == x['dim_B'], 'pct_']
    .sum()
)
0.33333333333333337

Alternatively, you could: compare → aggregate → filter.

(
    df
    .assign(is_eq=lambda x: x['dim_A'] == x['dim_B'])
    .groupby('is_eq')['num_'].sum()
    .pipe(lambda x: x[True] / x.sum())
)
0.3333333333333333

If you have a series w/ dimensions as MultiIndex, you can combine compare/filter/aggregate steps into the groupby call itself using a lambda function, which receives a tuple representing the index values of each row.

srs = df.set_index(['dim_A', 'dim_B'])['num_']

(
    srs
    .groupby(lambda x: x[0] == x[1]).sum()
    .pipe(lambda x: x[True] / x.sum())
)
0.3333333333333333