Geoff Ruddock

Data wrangling

import numpy as np
import pandas as pd
Package    Version
---------  ---------
python     3.9.12
numpy      1.21.5
pandas     1.4.2

Data types

Inferring dtypes automatically

Let’s make a toy df that contains a variety of different dtypes.

df = pd.DataFrame({
    'Int': [1, 2, 3],
    'Date': ['2021-01-01', '2018-05-07', '2011-12-12'],
    'Float': [1.2, 2.3, 3.4],
    'Bool': [True, False, True],
    'Str': ['A', 'B', 'C']
})

df.dtypes
Int        int64
Date      object
Float    float64
Bool        bool
Str       object
dtype: object

If we write to a CSV file and read it back, pd.read_csv() actually does a great job of inferring the original dtypes for each column, with the exception of Date, which comes back as object.

import tempfile

with tempfile.TemporaryFile() as f:
    df.to_csv(f, index=False)
    _ = f.seek(0)  # rewind to the start of the file before reading it back
    df2 = pd.read_csv(f)
    
df2.dtypes
Int        int64
Date      object
Float    float64
Bool        bool
Str       object
dtype: object
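
If we also want Date parsed on read, pd.read_csv() accepts a parse_dates argument. A minimal sketch, re-using the same temp-file round trip (df3 is just a throwaway name here):

with tempfile.TemporaryFile() as f:
    df.to_csv(f, index=False)
    _ = f.seek(0)
    df3 = pd.read_csv(f, parse_dates=['Date'])  # Date comes back as datetime64[ns]

df3.dtypes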

Suppose we are dealing with a df whose columns are all dtype object—perhaps the output from some other package or function that isn’t quite so diligent as pd.read_csv().

df_as_all_obj = df2.astype(object)
df_as_all_obj.dtypes
Int      object
Date     object
Float    object
Bool     object
Str      object
dtype: object

We can still recover them using df.convert_dtypes().

This also makes use of more modern dtypes where possible, including nullable integers and an explicit string dtype, rather than plain object.

df_as_all_obj.convert_dtypes().dtypes
Int        Int64
Date      string
Float    Float64
Bool     boolean
Str       string
dtype: object
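
One thing convert_dtypes() does not do is infer datetimes, so the Date column still needs an explicit conversion, e.g. with pd.to_datetime. A minimal sketch:

(
    df_as_all_obj
    .convert_dtypes()
    .assign(Date=lambda x: pd.to_datetime(x['Date']))  # Date becomes datetime64[ns]
    .dtypes
)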

Strings

cities = ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore']

df = pd.DataFrame({
    'age': np.random.randint(20, 80, size=100),
    'location': np.random.choice(cities, size=100)
}).rename_axis('user_id')

df
df.dtypes

         age   location
user_id
0         33  Singapore
1         27    Chicago
2         46    Toronto
3         28    Toronto
4         50    Toronto
...      ...        ...
95        76   New York
96        73     Berlin
97        27   New York
98        43  Singapore
99        53      Paris

[100 rows x 2 columns]

age          int64
location    object
dtype: object

Replacing values

☠️ Do not use Series.map(dict) for this: any value not present in the dict is mapped to NaN, whereas Series.replace(dict) leaves unmatched values untouched.

df['location'].replace({'Berlin': 'Germany'}).head()
user_id
0    Singapore
1      Chicago
2      Toronto
3      Toronto
4      Toronto
Name: location, dtype: object
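
For contrast, the same dict passed to map; everything not listed in the dict becomes NaN (a quick sketch; the exact rows depend on the random data above):

df['location'].map({'Berlin': 'Germany'}).head()  # non-Berlin rows all come back as NaN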

Numeric

Assign percentage columns

df = pd.DataFrame({
    'dim': ['A', 'B', 'C'],
    'num_A': [1, 2, 3],
    'num_B': [9, 8, 7],
    'random_col': [True, False, True]
})

df

  dim  num_A  num_B  random_col
0   A      1      9        True
1   B      2      8       False
2   C      3      7        True
from typing import Union, List
import itertools

def assign_pct_cols(df: pd.DataFrame, target_cols: Union[str, List[str], None]=None, debug=False) -> pd.DataFrame:
    """ Assign pct_ cols for a specified set of num_ cols, preserving column order. """

    if target_cols is None:
        target_cols = [x for x in df.columns if x.startswith('num_') or x.startswith('n_')]
    elif isinstance(target_cols, str):
        target_cols = [target_cols]

    old_cols = list(df.columns)
    df = df.copy()

    # derive pct_ names from the num_ (or n_) prefix
    new_cols = [col.replace('num_', 'pct_', 1) if col.startswith('num_')
                else col.replace('n_', 'pct_', 1)
                for col in target_cols]

    for num_col, pct_col in zip(target_cols, new_cols):
        df[pct_col] = df[num_col] / df[num_col].sum()  # share of the column total
        old_idx = old_cols.index(num_col)
        old_cols.insert(old_idx + 1, pct_col)          # slot pct_ right after its num_ column

    idx = list(itertools.chain(*zip(target_cols, new_cols)))

    if debug:
        print(f'old_cols: {old_cols}')
        print(f'target_cols: {target_cols}')
        print(f'new_cols: {new_cols}')
        print(f'idx: {idx}')

    return df.reindex(old_cols, axis=1)

assign_pct_cols(df)

  dim  num_A     pct_A  num_B     pct_B  random_col
0   A      1  0.166667      9  0.375000        True
1   B      2  0.333333      8  0.333333       False
2   C      3  0.500000      7  0.291667        True
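
The target_cols argument also accepts a single column name or an explicit list, in case we only want percentages for some of the num_ columns. A quick sketch:

assign_pct_cols(df, target_cols='num_A')  # only num_A gets a matching pct_A column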

Apply

df = pd.DataFrame({
    'X': [1, 2, 3],
    'Y': [4, 5, 6],
    'Z': [7, 8, 9]
}, index=['A', 'B', 'C'])

df

   X  Y  Z
A  1  4  7
B  2  5  8
C  3  6  9
df.apply(lambda x: x.sum(), axis=0)
X     6
Y    15
Z    24
dtype: int64
df.apply(lambda x: x.sum(), axis=1)
A    12
B    15
C    18
dtype: int64
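
With axis=0 the function receives each column as a Series; with axis=1 it receives each row, so it can combine values across columns. A minimal sketch:

df.apply(lambda row: row['X'] + row['Z'], axis=1)  # per-row sum of just X and Z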

Assign

Using DataFrame.assign() is a method-chaining friendly approach for assigning new columns.

df.assign(double_Y=lambda x: x['Y'] * 2)

   X  Y  Z  double_Y
A  1  4  7         8
B  2  5  8        10
C  3  6  9        12

But one drawback is that the column name must be a valid Python identifier. You can get around this by putting the column names in a dict and unpacking it into assign() as keyword arguments with **.

doubled = df.assign(**{'Double Y': lambda x: x['Y'] * 2})
doubled

   X  Y  Z  Double Y
A  1  4  7         8
B  2  5  8        10
C  3  6  9        12
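
Keyword arguments to assign() are applied in order, and each lambda receives the DataFrame as it exists at that point, so later columns can build on ones defined earlier in the same call. A minimal sketch:

df.assign(
    double_Y=lambda x: x['Y'] * 2,
    quadruple_Y=lambda x: x['double_Y'] * 2,  # references the column created just above
)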

Sort

Sort by sum across multiple columns

Suppose we have a dataframe with a numeric value that is aggregated across multiple dimensions which have a hierarchical structure, such as:

Country → State/Province → City

df = pd.DataFrame([
        ['Canada', 'Ontario', 'Toronto', 5.6],
        ['Canada', 'Ontario', 'Ottawa', 1.1],
        ['Canada', 'Ontario', 'London', 0.4],
        ['Canada', 'Quebec', 'Quebec City', 0.7],
        ['Canada', 'Quebec', 'Montreal', 3.7],
        ['USA', 'New York', 'NYC', 8.6],
        ['USA', 'New York', 'Buffalo', 0.3],
        ['USA', 'Florida', 'Orlando', 0.3],
        ['USA', 'Florida', 'Miami', 0.4],
    ], columns=['country', 'region', 'city', 'num_'])

df.sort_values(by='num_', ascending=False)

  country    region         city  num_
5     USA  New York          NYC   8.6
0  Canada   Ontario      Toronto   5.6
4  Canada    Quebec     Montreal   3.7
1  Canada   Ontario       Ottawa   1.1
3  Canada    Quebec  Quebec City   0.7
2  Canada   Ontario       London   0.4
8     USA   Florida        Miami   0.4
6     USA  New York      Buffalo   0.3
7     USA   Florida      Orlando   0.3

Sorting by the value alone puts the largest cities at the top. That is fine if you just want to identify big cities, but it doesn't give much intuition about the other levels of the hierarchy.

def multi_level_groupby_sort(df):
    """ Sort by aggregate of multiple levels """
    
    *dim_cols, num_col = df.columns.tolist()
    df = df.set_index(dim_cols)
    
    for i, c in enumerate(dim_cols):
        # total for each label at this index level, broadcast back onto every row
        df[f'sum_{i}'] = df.groupby(level=i)[num_col].transform('sum')

    sort_cols = [f'sum_{i}' for i, _ in enumerate(dim_cols)]

    return (
        df
        .sort_values(by=sort_cols, ascending=False)
        .drop(sort_cols, axis=1)
        .assign(pct_=lambda x: x[num_col] / x[num_col].sum())
    )

multi_level_groupby_sort(df)

                              num_      pct_
country region   city
Canada  Ontario  Toronto       5.6  0.265403
                 Ottawa        1.1  0.052133
                 London        0.4  0.018957
        Quebec   Montreal      3.7  0.175355
                 Quebec City   0.7  0.033175
USA     New York NYC           8.6  0.407583
                 Buffalo       0.3  0.014218
        Florida  Miami         0.4  0.018957
                 Orlando       0.3  0.014218
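
The key mechanic here is groupby(...).transform('sum'), which broadcasts each group's total back onto every row of that group. A quick sketch of the country-level step in isolation:

tmp = df.set_index(['country', 'region', 'city'])
tmp.groupby(level=0)['num_'].transform('sum')  # every row gets its country's total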

Reorder columns

def move_column_to_end_of_df(df: pd.DataFrame, col: str = 'Other', offset: int = 0) -> pd.DataFrame:
    """ Reindex a dataframe so a specified column sits at the end, or `offset` positions from the end. """

    cols = df.columns.tolist()
    if col in cols:
        if offset == 0:
            cols.append(cols.pop(cols.index(col)))           # move to the very end
        else:
            cols.insert(-offset, cols.pop(cols.index(col)))  # move to `offset` positions from the end
    return df.reindex(cols, axis=1)


doubled.pipe(move_column_to_end_of_df, col='Double Y', offset=1)

   X  Y  Double Y  Z
A  1  4         8  7
B  2  5        10  8
C  3  6        12  9
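
With the default offset=0 the column moves all the way to the end. A quick sketch:

doubled.pipe(move_column_to_end_of_df, col='Y')  # column order becomes X, Z, Double Y, Y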

Further reading