Geoff Ruddock

Data wrangling

import numpy as np
import pandas as pd
Package    Version
---------  ---------
python     3.9.12
numpy      1.21.5
pandas     1.4.2

Data types

Inferring dtypes automatically

Let’s make a toy df that contains a variety of different dtypes.

df = pd.DataFrame({
    'Int': [1, 2, 3],
    'Date': ['2021-01-01', '2018-05-07', '2011-12-12'],
    'Float': [1.2, 2.3, 3.4],
    'Bool': [True, False, True],
    'Str': ['A', 'B', 'C']
})

df.dtypes
Int        int64
Date      object
Float    float64
Bool        bool
Str       object
dtype: object

If we write to a CSV file and read it back, pd.read_csv() actually does a great job of inferring the original dtypes for each column, with the exception of Date, which comes back as object.

import tempfile

with tempfile.TemporaryFile() as f:
    df.to_csv(f, index=False)
    _ = f.seek(0)  # rewind to the start of the file before reading it back
    df2 = pd.read_csv(f)
    
df2.dtypes
Int        int64
Date      object
Float    float64
Bool        bool
Str       object
dtype: object
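
If we also want Date parsed on read, pd.read_csv() accepts a parse_dates argument. A minimal sketch, re-using the same temp-file round trip (df3 is just a throwaway name here):

with tempfile.TemporaryFile() as f:
    df.to_csv(f, index=False)
    _ = f.seek(0)
    df3 = pd.read_csv(f, parse_dates=['Date'])  # Date comes back as datetime64[ns]

df3.dtypes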

Suppose we are dealing with a df whose columns are all dtype object—perhaps the output from some other package or function that isn’t quite so diligent as pd.read_csv().

df_as_all_obj = df2.astype(object)
df_as_all_obj.dtypes
Int      object
Date     object
Float    object
Bool     object
Str      object
dtype: object

We can still recover them using df.convert_dtypes().

This also makes use of more modern dtypes where possible, including nullable integers and an explicit string dtype, rather than plain object.

df_as_all_obj.convert_dtypes().dtypes
Int        Int64
Date      string
Float    Float64
Bool     boolean
Str       string
dtype: object
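
One thing convert_dtypes() does not do is infer datetimes, so the Date column still needs an explicit conversion, e.g. with pd.to_datetime. A minimal sketch:

(
    df_as_all_obj
    .convert_dtypes()
    .assign(Date=lambda x: pd.to_datetime(x['Date']))  # Date becomes datetime64[ns]
    .dtypes
)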

Strings

cities = ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore']

df = pd.DataFrame({
    'age': np.random.randint(20, 80, size=100),
    'location': np.random.choice(cities, size=100)
}).rename_axis('user_id')

df
df.dtypes

         age   location
user_id
0         33  Singapore
1         27    Chicago
2         46    Toronto
3         28    Toronto
4         50    Toronto
...      ...        ...
95        76   New York
96        73     Berlin
97        27   New York
98        43  Singapore
99        53      Paris

[100 rows x 2 columns]

age          int64
location    object
dtype: object

Replacing values

☠️ Do not use Series.map(dict) for this: any value not present in the dict is mapped to NaN, whereas Series.replace(dict) leaves unmatched values untouched.

df['location'].replace({'Berlin': 'Germany'}).head()
user_id
0    Singapore
1      Chicago
2      Toronto
3      Toronto
4      Toronto
Name: location, dtype: object
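
For contrast, the same dict passed to map; everything not listed in the dict becomes NaN (a quick sketch; the exact rows depend on the random data above):

df['location'].map({'Berlin': 'Germany'}).head()  # non-Berlin rows all come back as NaN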

Numeric

Assign percentage columns

df = pd.DataFrame({
    'dim': ['A', 'B', 'C'],
    'num_A': [1, 2, 3],
    'num_B': [9, 8, 7],
    'random_col': [True, False, True]
})

df

  dim  num_A  num_B  random_col
0   A      1      9        True
1   B      2      8       False
2   C      3      7        True
from typing import Union, List
import itertools

def assign_pct_cols(df: pd.DataFrame, target_cols: Union[str, List[str], None]=None, debug=False) -> pd.DataFrame:
    """ Assign pct_ cols for a specified set of num_ cols, preserving column order. """

    if target_cols is None:
        target_cols = [x for x in df.columns if x.startswith('num_') or x.startswith('n_')]
    elif isinstance(target_cols, str):
        target_cols = [target_cols]

    old_cols = list(df.columns)
    df = df.copy()

    # derive pct_ names from the num_ (or n_) prefix
    new_cols = [col.replace('num_', 'pct_', 1) if col.startswith('num_')
                else col.replace('n_', 'pct_', 1)
                for col in target_cols]

    for num_col, pct_col in zip(target_cols, new_cols):
        df[pct_col] = df[num_col] / df[num_col].sum()  # share of the column total
        old_idx = old_cols.index(num_col)
        old_cols.insert(old_idx + 1, pct_col)          # slot pct_ right after its num_ column

    idx = list(itertools.chain(*zip(target_cols, new_cols)))

    if debug:
        print(f'old_cols: {old_cols}')
        print(f'target_cols: {target_cols}')
        print(f'new_cols: {new_cols}')
        print(f'idx: {idx}')

    return df.reindex(old_cols, axis=1)

assign_pct_cols(df)

  dim  num_A     pct_A  num_B     pct_B  random_col
0   A      1  0.166667      9  0.375000        True
1   B      2  0.333333      8  0.333333       False
2   C      3  0.500000      7  0.291667        True
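
The target_cols argument also accepts a single column name or an explicit list, in case we only want percentages for some of the num_ columns. A quick sketch:

assign_pct_cols(df, target_cols='num_A')  # only num_A gets a matching pct_A column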

Apply

df = pd.DataFrame({
    'X': [1, 2, 3],
    'Y': [4, 5, 6],
    'Z': [7, 8, 9]
}, index=['A', 'B', 'C'])

df

   X  Y  Z
A  1  4  7
B  2  5  8
C  3  6  9
df.apply(lambda x: x.sum(), axis=0)
X     6
Y    15
Z    24
dtype: int64
df.apply(lambda x: x.sum(), axis=1)
A    12
B    15
C    18
dtype: int64
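
With axis=0 the function receives each column as a Series; with axis=1 it receives each row, so it can combine values across columns. A minimal sketch:

df.apply(lambda row: row['X'] + row['Z'], axis=1)  # per-row sum of just X and Z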

Assign

Using DataFrame.assign() is a method-chaining friendly approach for assigning new columns.

df.assign(double_Y=lambda x: x['Y'] * 2)

   X  Y  Z  double_Y
A  1  4  7         8
B  2  5  8        10
C  3  6  9        12

But one drawback is that the column name must be a valid Python identifier. You can get around this by putting the column names in a dict and unpacking it into assign() as keyword arguments with **.

doubled = df.assign(**{'Double Y': lambda x: x['Y'] * 2})
doubled

   X  Y  Z  Double Y
A  1  4  7         8
B  2  5  8        10
C  3  6  9        12
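
Keyword arguments to assign() are applied in order, and each lambda receives the DataFrame as it exists at that point, so later columns can build on ones defined earlier in the same call. A minimal sketch:

df.assign(
    double_Y=lambda x: x['Y'] * 2,
    quadruple_Y=lambda x: x['double_Y'] * 2,  # references the column created just above
)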

Sort

Sort by sum across multiple columns

Suppose we have a dataframe with a numeric value that is aggregated across multiple dimensions which have a hierarchical structure, such as:

Country → State/Province → City

df = pd.DataFrame([
        ['Canada', 'Ontario', 'Toronto', 5.6],
        ['Canada', 'Ontario', 'Ottawa', 1.1],
        ['Canada', 'Ontario', 'London', 0.4],
        ['Canada', 'Quebec', 'Quebec City', 0.7],
        ['Canada', 'Quebec', 'Montreal', 3.7],
        ['USA', 'New York', 'NYC', 8.6],
        ['USA', 'New York', 'Buffalo', 0.3],
        ['USA', 'Florida', 'Orlando', 0.3],
        ['USA', 'Florida', 'Miami', 0.4],
    ], columns=['country', 'region', 'city', 'num_'])

df.sort_values(by='num_', ascending=False)

  country    region         city  num_
5     USA  New York          NYC   8.6
0  Canada   Ontario      Toronto   5.6
4  Canada    Quebec     Montreal   3.7
1  Canada   Ontario       Ottawa   1.1
3  Canada    Quebec  Quebec City   0.7
2  Canada   Ontario       London   0.4
8     USA   Florida        Miami   0.4
6     USA  New York      Buffalo   0.3
7     USA   Florida      Orlando   0.3

Sorting by the value alone puts the largest cities at the top. That is fine if you just want to identify big cities, but it doesn't give much intuition about the other levels of the hierarchy.

def multi_level_groupby_sort(df):
    """ Sort by aggregate of multiple levels """
    
    *dim_cols, num_col = df.columns.tolist()
    df = df.set_index(dim_cols)
    
    for i, c in enumerate(dim_cols):
        # total for each label at this index level, broadcast back onto every row
        df[f'sum_{i}'] = df.groupby(level=i)[num_col].transform('sum')

    sort_cols = [f'sum_{i}' for i, _ in enumerate(dim_cols)]

    return (
        df
        .sort_values(by=sort_cols, ascending=False)
        .drop(sort_cols, axis=1)
        .assign(pct_=lambda x: x[num_col] / x[num_col].sum())
    )

multi_level_groupby_sort(df)

                              num_      pct_
country region   city
Canada  Ontario  Toronto       5.6  0.265403
                 Ottawa        1.1  0.052133
                 London        0.4  0.018957
        Quebec   Montreal      3.7  0.175355
                 Quebec City   0.7  0.033175
USA     New York NYC           8.6  0.407583
                 Buffalo       0.3  0.014218
        Florida  Miami         0.4  0.018957
                 Orlando       0.3  0.014218
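
The key mechanic here is groupby(...).transform('sum'), which broadcasts each group's total back onto every row of that group. A quick sketch of the country-level step in isolation:

tmp = df.set_index(['country', 'region', 'city'])
tmp.groupby(level=0)['num_'].transform('sum')  # every row gets its country's total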

Reorder columns

def move_column_to_end_of_df(df: pd.DataFrame, col: str = 'Other', offset: int = 0) -> pd.DataFrame:
    """ Reindex a dataframe so a specified column sits at the end, or `offset` positions from the end. """

    cols = df.columns.tolist()
    if col in cols:
        if offset == 0:
            cols.append(cols.pop(cols.index(col)))           # move to the very end
        else:
            cols.insert(-offset, cols.pop(cols.index(col)))  # move to `offset` positions from the end
    return df.reindex(cols, axis=1)


doubled.pipe(move_column_to_end_of_df, col='Double Y', offset=1)

   X  Y  Double Y  Z
A  1  4         8  7
B  2  5        10  8
C  3  6        12  9
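
With the default offset=0 the column moves all the way to the end. A quick sketch:

doubled.pipe(move_column_to_end_of_df, col='Y')  # column order becomes X, Z, Double Y, Y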

Further reading