import numpy as np
import pandas as pd
Package Version
--------- ---------
python 3.8.8
numpy 1.20.1
pandas 1.3.3
Selecting
Select only datetime columns
dates = pd.date_range('2010-01-01', '2019-12-31')
dates
df = pd.DataFrame({
'user_id': range(6),
'name': ['Alice', 'Bob', 'Charles', 'Denise', 'Eric', 'Fiona'],
'age': np.random.randint(20, 80, size=6),
'location': ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore'],
'registration_date': np.random.choice(dates, size=6),
'purchase_date': np.random.choice(dates, size=6),
'cancel_date': np.random.choice(dates, size=6)
}).set_index('user_id')
df.select_dtypes(include=['datetime64[ns]'])
| registration_date | purchase_date | cancel_date |
|---|
| user_id | | | |
|---|
| 0 | 2019-06-10 | 2017-02-16 | 2014-06-16 |
|---|
| 1 | 2011-10-12 | 2012-08-13 | 2014-01-26 |
|---|
| 2 | 2016-08-26 | 2015-08-25 | 2012-12-17 |
|---|
| 3 | 2018-11-07 | 2010-03-09 | 2010-05-16 |
|---|
| 4 | 2019-04-19 | 2014-09-16 | 2018-04-19 |
|---|
| 5 | 2019-01-02 | 2013-02-05 | 2014-03-09 |
|---|
Select only columns with non-zero values
df = pd.DataFrame(np.random.randint(1, 100, size=(10, 5)))
df.iloc[:, [2, 4]] = 0
df
| 0 | 1 | 2 | 3 | 4 |
|---|
| 0 | 5 | 51 | 0 | 77 | 0 |
|---|
| 1 | 17 | 74 | 0 | 1 | 0 |
|---|
| 2 | 89 | 12 | 0 | 19 | 0 |
|---|
| 3 | 82 | 11 | 0 | 81 | 0 |
|---|
| 4 | 65 | 84 | 0 | 59 | 0 |
|---|
| 5 | 16 | 16 | 0 | 83 | 0 |
|---|
| 6 | 8 | 92 | 0 | 54 | 0 |
|---|
| 7 | 23 | 23 | 0 | 47 | 0 |
|---|
| 8 | 48 | 30 | 0 | 83 | 0 |
|---|
| 9 | 49 | 68 | 0 | 52 | 0 |
|---|
df.loc[:, (df != 0).any(axis=0)]
| 0 | 1 | 3 |
|---|
| 0 | 5 | 51 | 77 |
|---|
| 1 | 17 | 74 | 1 |
|---|
| 2 | 89 | 12 | 19 |
|---|
| 3 | 82 | 11 | 81 |
|---|
| 4 | 65 | 84 | 59 |
|---|
| 5 | 16 | 16 | 83 |
|---|
| 6 | 8 | 92 | 54 |
|---|
| 7 | 23 | 23 | 47 |
|---|
| 8 | 48 | 30 | 83 |
|---|
| 9 | 49 | 68 | 52 |
|---|
Select using combination of loc and iloc
df.iloc[5:, df.columns.get_indexer([0, 2])]
Modifying
Cast index as datetime
df_with_string_index = pd.DataFrame({
'A': [1, 2, 3],
'B': [4, 5, 6]
}, index=['2020-01-01', '2020-01-02', '2020-01-03'])
df_with_string_index
| A | B |
|---|
| 2020-01-01 | 1 | 4 |
|---|
| 2020-01-02 | 2 | 5 |
|---|
| 2020-01-03 | 3 | 6 |
|---|
df_with_string_index.index
Index(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='object')
df_with_dt_index = df_with_string_index.pipe(lambda x: x.set_index(pd.to_datetime(x.index)))
df_with_dt_index
| A | B |
|---|
| 2020-01-01 | 1 | 4 |
|---|
| 2020-01-02 | 2 | 5 |
|---|
| 2020-01-03 | 3 | 6 |
|---|
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq=None)
MultiIndex
Select rows in which one level of the multiindex match a list of values
bool_filt = (df.groupby(level='user_id').size() > 1)
user_ids_to_include = bool_filt[bool_filt].index
ms_df = df[lambda x: x.index.get_level_values('user_id').isin(user_ids_to_include)]
Convert all levels of a hierarchical index to string
df.columns.set_levels([x.astype(str) for x in x.columns.levels])
E.g. including rows from users with more than one row
bool_filt = (df.groupby(level='user_id').size() > 1)
user_ids_to_include = bool_filt[bool_filt].index
multi_search_df = df[lambda x: x.index.get_level_values('user_id').isin(user_ids_to_include)]
Levels
df = pd.DataFrame(index=list('abcde'), data={'A': range(5), 'B': range(5)})
display(df)
Add level above existing
insert_above = df.copy()
insert_above.columns = pd.MultiIndex.from_product([['Super'], df.columns])
insert_above
Add level below existing
insert_below = df.copy()
insert_below.columns = pd.MultiIndex.from_product([df.columns, ['Sub']])
insert_below
Swap existing levels
df = pd.DataFrame(index=list('abcde'), data={'sub_level_1': range(5), 'sub_level_2': range(5)})
df.columns = pd.MultiIndex.from_product([['super_level'], df.columns])
df
| super_level |
|---|
| sub_level_1 | sub_level_2 |
|---|
| a | 0 | 0 |
|---|
| b | 1 | 1 |
|---|
| c | 2 | 2 |
|---|
| d | 3 | 3 |
|---|
| e | 4 | 4 |
|---|
| sub_level_1 | sub_level_2 |
|---|
| super_level | super_level |
|---|
| a | 0 | 0 |
|---|
| b | 1 | 1 |
|---|
| c | 2 | 2 |
|---|
| d | 3 | 3 |
|---|
| e | 4 | 4 |
|---|
Modify a level
date_strs = ['2020-01-01', '2020-01-02', '2020-01-03']
dims = ['A', 'B', 'C']
idx = pd.MultiIndex.from_product([date_strs, dims])
df = pd.DataFrame(index=idx, data={'A': range(len(idx)), 'B': range(len(idx))})
display(df)
| | A | B |
|---|
| 2020-01-01 | A | 0 | 0 |
|---|
| B | 1 | 1 |
|---|
| C | 2 | 2 |
|---|
| 2020-01-02 | A | 3 | 3 |
|---|
| B | 4 | 4 |
|---|
| C | 5 | 5 |
|---|
| 2020-01-03 | A | 6 | 6 |
|---|
| B | 7 | 7 |
|---|
| C | 8 | 8 |
|---|
level_0 object
level_1 object
dtype: object
df.index.set_levels(pd.to_datetime(df.index.levels[0]), level=0)
MultiIndex([('2020-01-01', 'A'),
('2020-01-01', 'B'),
('2020-01-01', 'C'),
('2020-01-02', 'A'),
('2020-01-02', 'B'),
('2020-01-02', 'C'),
('2020-01-03', 'A'),
('2020-01-03', 'B'),
('2020-01-03', 'C')],
)
MultiIndex to DataFrame
Create a DataFrame out of the levels of a MultiIndex.
df.index.to_frame(index=False)
| 0 | 1 |
|---|
| 0 | 2020-01-01 | A |
|---|
| 1 | 2020-01-01 | B |
|---|
| 2 | 2020-01-01 | C |
|---|
| 3 | 2020-01-02 | A |
|---|
| 4 | 2020-01-02 | B |
|---|
| 5 | 2020-01-02 | C |
|---|
| 6 | 2020-01-03 | A |
|---|
| 7 | 2020-01-03 | B |
|---|
| 8 | 2020-01-03 | C |
|---|