Geoff Ruddock

Data wrangling

import numpy as np
import pandas as pd
Package    Version
---------  ---------
python     3.9.12
numpy      1.21.5
pandas     1.4.2

Data types

Inferring dtypes automatically

Let’s make a toy df that contains a variety of different dtypes.

df = pd.DataFrame({
    'Int': [1, 2, 3],
    'Date': ['2021-01-01', '2018-05-07', '2011-12-12'],
    'Float': [1.2, 2.3, 3.4],
    'Bool': [True, False, True],
    'Str': ['A', 'B', 'C']
})

df.dtypes
Int        int64
Date      object
Float    float64
Bool        bool
Str       object
dtype: object

If we write to a csv file and read it back, pd.read_csv() actually does a great job of inferring the original dtypes for each column—with the exception of Date.

import tempfile

with tempfile.TemporaryFile() as f:
    df.to_csv(f, index=False)
    _ = f.seek(0)
    df2 = pd.read_csv(f)
    
df2.dtypes
Int        int64
Date      object
Float    float64
Bool        bool
Str       object
dtype: object

Suppose we are dealing with a df whose columns are all dtype object—perhaps the output from some other package or function that isn’t quite so diligent as pd.read_csv().

df_as_all_obj = df2.astype(object)
df_as_all_obj.dtypes
Int      object
Date     object
Float    object
Bool     object
Str      object
dtype: object

We can still recover them using df.convert_dtypes().

This also makes use of more modern dtypes when possible, incl. nullable ints and explicit strings rather than just Object.

df_as_all_obj.convert_dtypes().dtypes
Int        Int64
Date      string
Float    Float64
Bool     boolean
Str       string
dtype: object

Strings

cities = ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore']

df = pd.DataFrame({
    'age': np.random.randint(20, 80, size=100),
    'location': np.random.choice(cities, size=100)
}).rename_axis('user_id')

df
df.dtypes

age location
user_id
0 33 Singapore
1 27 Chicago
2 46 Toronto
3 28 Toronto
4 50 Toronto
... ... ...
95 76 New York
96 73 Berlin
97 27 New York
98 43 Singapore
99 53 Paris

100 rows × 2 columns

age          int64
location    object
dtype: object

Replacing values

☠️ Do not use df.map(dict) because it does not handle values which are not present in the dict argument.

df['location'].replace({'Berlin': 'Germany'}).head()
user_id
0    Singapore
1      Chicago
2      Toronto
3      Toronto
4      Toronto
Name: location, dtype: object

Numeric

Assign percentage columns

df = pd.DataFrame({
    'dim': ['A', 'B', 'C'],
    'num_A': [1, 2, 3],
    'num_B': [9, 8, 7],
    'random_col': [True, False, True]
})

df

dim num_A num_B random_col
0 A 1 9 True
1 B 2 8 False
2 C 3 7 True
from typing import Union, List
import itertools

def assign_pct_cols(df: pd.DataFrame, target_cols: Union[str, List[str], None]=None, debug=False) -> pd.DataFrame:
    """ Assign pct_ cols for a specified set of num_ cols, preserving column order. """

    if target_cols is None:
        target_cols = [x for x in df.columns if x.startswith('num_') or x.startswith('n_')]
    elif isinstance(target_cols, str):
        target_cols = [target_cols]

    old_cols = list(df.columns)
    df = df.copy()

    new_cols = [col.replace('num_', 'pct_') for col in target_cols]
    
    for num_col, pct_col in zip(target_cols, new_cols):
        df[pct_col] = df[num_col] / df[num_col].sum()
        old_idx = old_cols.index(num_col)
        old_cols.insert(old_idx+1, pct_col)

    idx = list(itertools.chain(*zip(target_cols, new_cols)))

    if debug:
        print(f'old_cols: {old_cols}')
        print(f'target_cols: {target_cols}')
        print(f'new_cols: {new_cols}')
        print(f'idx: {idx}')

    return df.reindex(old_cols, axis=1)

assign_pct_cols(df)

dim num_A pct_A num_B pct_B random_col
0 A 1 0.166667 9 0.375000 True
1 B 2 0.333333 8 0.333333 False
2 C 3 0.500000 7 0.291667 True

Apply

df = pd.DataFrame({
    'X': [1, 2, 3],
    'Y': [4, 5, 6],
    'Z': [7, 8, 9]
}, index=['A', 'B', 'C'])

df

X Y Z
A 1 4 7
B 2 5 8
C 3 6 9
df.apply(lambda x: x.sum(), axis=0)
X     6
Y    15
Z    24
dtype: int64
df.apply(lambda x: x.sum(), axis=1)
A    12
B    15
C    18
dtype: int64

Assign

Using DataFrame.assign() is a method-chaining friendly approach for assigning new columns.

df.assign(double_Y=lambda x: x['Y'] * 2)

X Y Z double_Y
A 1 4 7 8
B 2 5 8 10
C 3 6 9 12

But one drawback is that the column name must be a valid python variable name. You can get around this by setting column names in a dict, and then passing that as kwargs to assign.

doubled = df.assign(**{'Double Y': lambda x: x['Y'] * 2})
doubled

X Y Z Double Y
A 1 4 7 8
B 2 5 8 10
C 3 6 9 12

Sort

Sort by sum across multiple columns

Suppose we have a dataframe with a numeric value that is aggregated across multiple dimensions which have a hierarchical structure, such as:

Country → State/Province → City

df = pd.DataFrame([
        ['Canada', 'Ontario', 'Toronto', 5.6],
        ['Canada', 'Ontario', 'Ottawa', 1.1],
        ['Canada', 'Ontario', 'London', 0.4],
        ['Canada', 'Quebec', 'Quebec City', 0.7],
        ['Canada', 'Quebec', 'Montreal', 3.7],
        ['USA', 'New York', 'NYC', 8.6],
        ['USA', 'New York', 'Buffalo', 0.3],
        ['USA', 'Florida', 'Orlando', 0.3],
        ['USA', 'Florida', 'Miami', 0.4],
    ], columns=['country', 'region', 'city', 'num_'])

df.sort_values(by='num_', ascending=False)

country region city num_
5 USA New York NYC 8.6
0 Canada Ontario Toronto 5.6
4 Canada Quebec Montreal 3.7
1 Canada Ontario Ottawa 1.1
3 Canada Quebec Quebec City 0.7
2 Canada Ontario London 0.4
8 USA Florida Miami 0.4
6 USA New York Buffalo 0.3
7 USA Florida Orlando 0.3

Simply sorting by the value will put all the largest cities on top. This is fine if you’re simply trying to identify big cities, but doesn’t give a great intuition for the other levels of the hierarchy.

def multi_level_groupby_sort(df):
    """ Sort by aggregate of multiple levels """
    
    *dim_cols, num_col = df.columns.tolist()
    df = df.set_index(dim_cols)
    
    for i, c in enumerate(dim_cols):
        df[f'sum_{i}'] = df.groupby(level=i)[num_col].transform('sum')

    sort_cols = [f'sum_{i}' for i, _ in enumerate(dim_cols)]

    return (
        df
        .sort_values(by=sort_cols, ascending=False)
        .drop(sort_cols, axis=1)
        .assign(pct_=lambda x: x[num_col] / x[num_col].sum())
    )

multi_level_groupby_sort(df)

num_ pct_
country region city
Canada Ontario Toronto 5.6 0.265403
Ottawa 1.1 0.052133
London 0.4 0.018957
Quebec Montreal 3.7 0.175355
Quebec City 0.7 0.033175
USA New York NYC 8.6 0.407583
Buffalo 0.3 0.014218
Florida Miami 0.4 0.018957
Orlando 0.3 0.014218

Reorder columns

def move_column_to_end_of_df(df: pd.DataFrame, col: str = 'Other', offset: int = 0) -> pd.DataFrame:
    """ Reindex a dataframe to have a specified column at the end. """
    
    cols = df.columns.tolist()
    if col in cols:
        if offset == 0:
            cols.append(cols.pop(cols.index(col)))
        else: 
            cols.insert(-offset, cols.pop(cols.index(col)))
    return df.reindex(cols, axis=1)


doubled.pipe(move_column_to_end_of_df, col='Double Y', offset=1)

X Y Double Y Z
A 1 4 8 7
B 2 5 10 8
C 3 6 12 9

Further reading


comments powered by Disqus