import numpy as np
import pandas as pd
Package | Version
---|---
python | 3.9.12
numpy | 1.21.5
pandas | 1.4.2
## Data types

### Inferring dtypes automatically
Let’s make a toy df that contains a variety of different dtypes.
df = pd.DataFrame({
    'Int': [1, 2, 3],
    'Date': ['2021-01-01', '2018-05-07', '2011-12-12'],
    'Float': [1.2, 2.3, 3.4],
    'Bool': [True, False, True],
    'Str': ['A', 'B', 'C']
})
df.dtypes
Int int64
Date object
Float float64
Bool bool
Str object
dtype: object
If we write to a CSV file and read it back, pd.read_csv() actually does a great job of inferring the original dtypes for each column, with the exception of Date.
import tempfile
with tempfile.TemporaryFile() as f:
    df.to_csv(f, index=False)
    _ = f.seek(0)
    df2 = pd.read_csv(f)
df2.dtypes
Int int64
Date object
Float float64
Bool bool
Str object
dtype: object
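read_csv can also be told up front which columns are dates. A small sketch reusing the same round-trip, with the standard parse_dates argument (df3 is just an illustrative name):

with tempfile.TemporaryFile() as f:
    df.to_csv(f, index=False)
    _ = f.seek(0)
    # parse_dates converts the listed columns to datetime64[ns] on read
    df3 = pd.read_csv(f, parse_dates=['Date'])

df3.dtypes  # Date is now datetime64[ns]; the other columns are inferred as before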
Suppose we are dealing with a df whose columns are all dtype object, perhaps the output from some other package or function that isn’t quite so diligent as pd.read_csv().
df_as_all_obj = df2.astype(object)
df_as_all_obj.dtypes
Int object
Date object
Float object
Bool object
Str object
dtype: object
We can still recover them using df.convert_dtypes(). This also makes use of the more modern extension dtypes when possible, including nullable integers and a dedicated string dtype rather than plain object.
df_as_all_obj.convert_dtypes().dtypes
Int Int64
Date string
Float Float64
Bool boolean
Str string
dtype: object
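Note that convert_dtypes() leaves Date as a string column; it does not attempt date parsing. One way to get real datetimes, assuming the column holds parseable date strings, is to chain pd.to_datetime():

(df_as_all_obj
    .convert_dtypes()
    .assign(Date=lambda x: pd.to_datetime(x['Date'])))  # Date becomes datetime64[ns]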
## Strings
cities = ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore']
df = pd.DataFrame({
    'age': np.random.randint(20, 80, size=100),
    'location': np.random.choice(cities, size=100)
}).rename_axis('user_id')
df
df.dtypes
user_id | age | location
---|---|---
0 | 33 | Singapore
1 | 27 | Chicago
2 | 46 | Toronto
3 | 28 | Toronto
4 | 50 | Toronto
... | ... | ...
95 | 76 | New York
96 | 73 | Berlin
97 | 27 | New York
98 | 43 | Singapore
99 | 53 | Paris

100 rows × 2 columns
age int64
location object
dtype: object
### Replacing values
☠️ Do not use Series.map(dict) for this, because it does not handle values that are missing from the dict argument: they all come back as NaN. Series.replace(dict) leaves unmatched values untouched.
df['location'].replace({'Berlin': 'Germany'}).head()
user_id
0 Singapore
1 Chicago
2 Toronto
3 Toronto
4 Toronto
Name: location, dtype: object
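For comparison, here is the .map() pitfall the warning refers to: every value missing from the dict comes back as NaN.

df['location'].map({'Berlin': 'Germany'}).head()
# All five values are NaN, since none of these rows is 'Berlin'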
## Numeric

### Assign percentage columns
df = pd.DataFrame({
    'dim': ['A', 'B', 'C'],
    'num_A': [1, 2, 3],
    'num_B': [9, 8, 7],
    'random_col': [True, False, True]
})
df
 | dim | num_A | num_B | random_col
---|---|---|---|---
0 | A | 1 | 9 | True
1 | B | 2 | 8 | False
2 | C | 3 | 7 | True
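For a single column, the percentage is just the column divided by its sum; the helper below generalizes this across several num_ columns while keeping each pct_ column next to its source.

df['num_A'] / df['num_A'].sum()  # 0.166667, 0.333333, 0.500000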
from typing import Union, List
def assign_pct_cols(df: pd.DataFrame, target_cols: Union[str, List[str], None] = None, debug: bool = False) -> pd.DataFrame:
    """Assign pct_ cols for a specified set of num_ cols, preserving column order."""
    if target_cols is None:
        target_cols = [x for x in df.columns if x.startswith('num_') or x.startswith('n_')]
    elif isinstance(target_cols, str):
        target_cols = [target_cols]
    old_cols = list(df.columns)
    df = df.copy()
    # Build the pct_ names; handle the bare 'n_' prefix too, so that an 'n_' column
    # is not silently overwritten by its own percentages
    new_cols = [
        col.replace('num_', 'pct_', 1) if col.startswith('num_') else 'pct_' + col[len('n_'):]
        for col in target_cols
    ]
    for num_col, pct_col in zip(target_cols, new_cols):
        df[pct_col] = df[num_col] / df[num_col].sum()
        # Slot each pct_ column directly after its source num_ column
        old_idx = old_cols.index(num_col)
        old_cols.insert(old_idx + 1, pct_col)
    if debug:
        print(f'old_cols: {old_cols}')
        print(f'target_cols: {target_cols}')
        print(f'new_cols: {new_cols}')
    return df.reindex(old_cols, axis=1)
assign_pct_cols(df)
 | dim | num_A | pct_A | num_B | pct_B | random_col
---|---|---|---|---|---|---
0 | A | 1 | 0.166667 | 9 | 0.375000 | True
1 | B | 2 | 0.333333 | 8 | 0.333333 | False
2 | C | 3 | 0.500000 | 7 | 0.291667 | True
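Per the signature above, target_cols also accepts a single column name if we only want one percentage column:

assign_pct_cols(df, target_cols='num_A')  # adds only pct_A, directly after num_A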
## Apply
df = pd.DataFrame({
    'X': [1, 2, 3],
    'Y': [4, 5, 6],
    'Z': [7, 8, 9]
}, index=['A', 'B', 'C'])
df
 | X | Y | Z
---|---|---|---
A | 1 | 4 | 7
B | 2 | 5 | 8
C | 3 | 6 | 9
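The axis argument controls what apply passes to the function: with axis=0 it receives each column as a Series, and with axis=1 each row.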
df.apply(lambda x: x.sum(), axis=0)
X 6
Y 15
Z 24
dtype: int64
df.apply(lambda x: x.sum(), axis=1)
A 12
B 15
C 18
dtype: int64
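For built-in reductions like these sums, calling the method directly is equivalent and avoids the Python-level lambda:

df.sum(axis=0)  # column sums, same result as the first apply above
df.sum(axis=1)  # row sums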
## Assign
Using DataFrame.assign() is a method-chaining friendly approach for assigning new columns.
df.assign(double_Y=lambda x: x['Y'] * 2)
 | X | Y | Z | double_Y
---|---|---|---|---
A | 1 | 4 | 7 | 8
B | 2 | 5 | 8 | 10
C | 3 | 6 | 9 | 12
But one drawback is that the column name must be a valid Python identifier. You can get around this by putting the column names in a dict and unpacking it into assign() as kwargs.
doubled = df.assign(**{'Double Y': lambda x: x['Y'] * 2})
doubled
 | X | Y | Z | Double Y
---|---|---|---|---
A | 1 | 4 | 7 | 8
B | 2 | 5 | 8 | 10
C | 3 | 6 | 9 | 12
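A related detail worth knowing: since keyword arguments are ordered, later columns in a single assign() call can reference earlier ones. A small sketch with a hypothetical quadruple_Y column:

df.assign(
    double_Y=lambda x: x['Y'] * 2,
    quadruple_Y=lambda x: x['double_Y'] * 2,  # refers to the column created just above
)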
## Sort

### Sort by sum across multiple columns
Suppose we have a dataframe with a numeric value that is aggregated across multiple dimensions which have a hierarchical structure, such as:
Country → State/Province → City
df = pd.DataFrame([
    ['Canada', 'Ontario', 'Toronto', 5.6],
    ['Canada', 'Ontario', 'Ottawa', 1.1],
    ['Canada', 'Ontario', 'London', 0.4],
    ['Canada', 'Quebec', 'Quebec City', 0.7],
    ['Canada', 'Quebec', 'Montreal', 3.7],
    ['USA', 'New York', 'NYC', 8.6],
    ['USA', 'New York', 'Buffalo', 0.3],
    ['USA', 'Florida', 'Orlando', 0.3],
    ['USA', 'Florida', 'Miami', 0.4],
], columns=['country', 'region', 'city', 'num_'])
df.sort_values(by='num_', ascending=False)
 | country | region | city | num_
---|---|---|---|---
5 | USA | New York | NYC | 8.6
0 | Canada | Ontario | Toronto | 5.6
4 | Canada | Quebec | Montreal | 3.7
1 | Canada | Ontario | Ottawa | 1.1
3 | Canada | Quebec | Quebec City | 0.7
2 | Canada | Ontario | London | 0.4
8 | USA | Florida | Miami | 0.4
6 | USA | New York | Buffalo | 0.3
7 | USA | Florida | Orlando | 0.3
Simply sorting by the value puts all the largest cities on top. This is fine if you’re just trying to identify big cities, but it doesn’t give a great intuition for the other levels of the hierarchy.
def multi_level_groupby_sort(df):
    """Sort by the aggregate value at every level of the hierarchy."""
    *dim_cols, num_col = df.columns.tolist()
    df = df.set_index(dim_cols)
    for i in range(len(dim_cols)):
        # Group by the cumulative set of levels 0..i, so that, e.g., two regions
        # sharing a name in different countries don't get summed together
        df[f'sum_{i}'] = df.groupby(level=list(range(i + 1)))[num_col].transform('sum')
    sort_cols = [f'sum_{i}' for i in range(len(dim_cols))]
    return (
        df
        .sort_values(by=sort_cols, ascending=False)
        .drop(sort_cols, axis=1)
        .assign(pct_=lambda x: x[num_col] / x[num_col].sum())
    )
multi_level_groupby_sort(df)
country | region | city | num_ | pct_
---|---|---|---|---
Canada | Ontario | Toronto | 5.6 | 0.265403
 | | Ottawa | 1.1 | 0.052133
 | | London | 0.4 | 0.018957
 | Quebec | Montreal | 3.7 | 0.175355
 | | Quebec City | 0.7 | 0.033175
USA | New York | NYC | 8.6 | 0.407583
 | | Buffalo | 0.3 | 0.014218
 | Florida | Miami | 0.4 | 0.018957
 | | Orlando | 0.3 | 0.014218
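As a sanity check on the top-level ordering, we can compute the country totals that drive the first sort key directly:

df.groupby('country')['num_'].sum().sort_values(ascending=False)
# Canada    11.5
# USA        9.6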
## Reorder columns
def move_column_to_end_of_df(df: pd.DataFrame, col: str = 'Other', offset: int = 0) -> pd.DataFrame:
    """Reindex a dataframe so that `col` sits `offset` positions from the end."""
    cols = df.columns.tolist()
    if col in cols:
        if offset == 0:
            # Move the column all the way to the end
            cols.append(cols.pop(cols.index(col)))
        else:
            # Insert the column `offset` positions before the end
            cols.insert(-offset, cols.pop(cols.index(col)))
    return df.reindex(cols, axis=1)
doubled.pipe(move_column_to_end_of_df, col='Double Y', offset=1)
 | X | Y | Double Y | Z
---|---|---|---|---
A | 1 | 4 | 8 | 7
B | 2 | 5 | 10 | 8
C | 3 | 6 | 12 | 9
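With the default offset=0, the column instead moves all the way to the end:

doubled.pipe(move_column_to_end_of_df, col='Double Y')
# column order: X, Y, Z, Double Y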
## Further reading
- Data School’s top 25 pandas tricks [Jupyter Notebook] – Good overview of some practical pandas code. The df.add_prefix() and df.add_suffix() methods for renaming columns are a neat trick that I just learned from this article.