import numpy as np
import pandas as pd
Package Version
--------- ---------
python 3.8.8
numpy 1.20.1
pandas 1.3.3
Selecting
Select only datetime columns
dates = pd.date_range('2010-01-01', '2019-12-31')
dates
df = pd.DataFrame({
'user_id': range(6),
'name': ['Alice', 'Bob', 'Charles', 'Denise', 'Eric', 'Fiona'],
'age': np.random.randint(20, 80, size=6),
'location': ['New York', 'Toronto', 'Chicago', 'Berlin', 'Paris', 'Singapore'],
'registration_date': np.random.choice(dates, size=6),
'purchase_date': np.random.choice(dates, size=6),
'cancel_date': np.random.choice(dates, size=6)
}).set_index('user_id')
df.select_dtypes(include=['datetime64[ns]'])
|
registration_date |
purchase_date |
cancel_date |
user_id |
|
|
|
0 |
2019-06-10 |
2017-02-16 |
2014-06-16 |
1 |
2011-10-12 |
2012-08-13 |
2014-01-26 |
2 |
2016-08-26 |
2015-08-25 |
2012-12-17 |
3 |
2018-11-07 |
2010-03-09 |
2010-05-16 |
4 |
2019-04-19 |
2014-09-16 |
2018-04-19 |
5 |
2019-01-02 |
2013-02-05 |
2014-03-09 |
Select only columns with non-zero values
df = pd.DataFrame(np.random.randint(1, 100, size=(10, 5)))
df.iloc[:, [2, 4]] = 0
df
|
0 |
1 |
2 |
3 |
4 |
0 |
5 |
51 |
0 |
77 |
0 |
1 |
17 |
74 |
0 |
1 |
0 |
2 |
89 |
12 |
0 |
19 |
0 |
3 |
82 |
11 |
0 |
81 |
0 |
4 |
65 |
84 |
0 |
59 |
0 |
5 |
16 |
16 |
0 |
83 |
0 |
6 |
8 |
92 |
0 |
54 |
0 |
7 |
23 |
23 |
0 |
47 |
0 |
8 |
48 |
30 |
0 |
83 |
0 |
9 |
49 |
68 |
0 |
52 |
0 |
df.loc[:, (df != 0).any(axis=0)]
|
0 |
1 |
3 |
0 |
5 |
51 |
77 |
1 |
17 |
74 |
1 |
2 |
89 |
12 |
19 |
3 |
82 |
11 |
81 |
4 |
65 |
84 |
59 |
5 |
16 |
16 |
83 |
6 |
8 |
92 |
54 |
7 |
23 |
23 |
47 |
8 |
48 |
30 |
83 |
9 |
49 |
68 |
52 |
Select using combination of loc and iloc
df.iloc[5:, df.columns.get_indexer([0, 2])]
|
0 |
2 |
5 |
16 |
0 |
6 |
8 |
0 |
7 |
23 |
0 |
8 |
48 |
0 |
9 |
49 |
0 |
Modifying
Cast index as datetime
df_with_string_index = pd.DataFrame({
'A': [1, 2, 3],
'B': [4, 5, 6]
}, index=['2020-01-01', '2020-01-02', '2020-01-03'])
df_with_string_index
|
A |
B |
2020-01-01 |
1 |
4 |
2020-01-02 |
2 |
5 |
2020-01-03 |
3 |
6 |
df_with_string_index.index
Index(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='object')
df_with_dt_index = df_with_string_index.pipe(lambda x: x.set_index(pd.to_datetime(x.index)))
df_with_dt_index
|
A |
B |
2020-01-01 |
1 |
4 |
2020-01-02 |
2 |
5 |
2020-01-03 |
3 |
6 |
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq=None)
MultiIndex
Select rows in which one level of the multiindex match a list of values
bool_filt = (df.groupby(level='user_id').size() > 1)
user_ids_to_include = bool_filt[bool_filt].index
ms_df = df[lambda x: x.index.get_level_values('user_id').isin(user_ids_to_include)]
Convert all levels of a hierarchical index to string
df.columns.set_levels([x.astype(str) for x in x.columns.levels])
E.g. including rows from users with more than one row
bool_filt = (df.groupby(level='user_id').size() > 1)
user_ids_to_include = bool_filt[bool_filt].index
multi_search_df = df[lambda x: x.index.get_level_values('user_id').isin(user_ids_to_include)]
Levels
df = pd.DataFrame(index=list('abcde'), data={'A': range(5), 'B': range(5)})
display(df)
|
A |
B |
a |
0 |
0 |
b |
1 |
1 |
c |
2 |
2 |
d |
3 |
3 |
e |
4 |
4 |
Add level above existing
insert_above = df.copy()
insert_above.columns = pd.MultiIndex.from_product([['Super'], df.columns])
insert_above
|
Super |
|
A |
B |
a |
0 |
0 |
b |
1 |
1 |
c |
2 |
2 |
d |
3 |
3 |
e |
4 |
4 |
Add level below existing
insert_below = df.copy()
insert_below.columns = pd.MultiIndex.from_product([df.columns, ['Sub']])
insert_below
|
A |
B |
|
Sub |
Sub |
a |
0 |
0 |
b |
1 |
1 |
c |
2 |
2 |
d |
3 |
3 |
e |
4 |
4 |
Swap existing levels
df = pd.DataFrame(index=list('abcde'), data={'sub_level_1': range(5), 'sub_level_2': range(5)})
df.columns = pd.MultiIndex.from_product([['super_level'], df.columns])
df
|
super_level |
|
sub_level_1 |
sub_level_2 |
a |
0 |
0 |
b |
1 |
1 |
c |
2 |
2 |
d |
3 |
3 |
e |
4 |
4 |
|
sub_level_1 |
sub_level_2 |
|
super_level |
super_level |
a |
0 |
0 |
b |
1 |
1 |
c |
2 |
2 |
d |
3 |
3 |
e |
4 |
4 |
Modify a level
date_strs = ['2020-01-01', '2020-01-02', '2020-01-03']
dims = ['A', 'B', 'C']
idx = pd.MultiIndex.from_product([date_strs, dims])
df = pd.DataFrame(index=idx, data={'A': range(len(idx)), 'B': range(len(idx))})
display(df)
|
|
A |
B |
2020-01-01 |
A |
0 |
0 |
B |
1 |
1 |
C |
2 |
2 |
2020-01-02 |
A |
3 |
3 |
B |
4 |
4 |
C |
5 |
5 |
2020-01-03 |
A |
6 |
6 |
B |
7 |
7 |
C |
8 |
8 |
level_0 object
level_1 object
dtype: object
df.index.set_levels(pd.to_datetime(df.index.levels[0]), level=0)
MultiIndex([('2020-01-01', 'A'),
('2020-01-01', 'B'),
('2020-01-01', 'C'),
('2020-01-02', 'A'),
('2020-01-02', 'B'),
('2020-01-02', 'C'),
('2020-01-03', 'A'),
('2020-01-03', 'B'),
('2020-01-03', 'C')],
)
MultiIndex to DataFrame
Create a DataFrame out of the levels of a MultiIndex.
df.index.to_frame(index=False)
|
0 |
1 |
0 |
2020-01-01 |
A |
1 |
2020-01-01 |
B |
2 |
2020-01-01 |
C |
3 |
2020-01-02 |
A |
4 |
2020-01-02 |
B |
5 |
2020-01-02 |
C |
6 |
2020-01-03 |
A |
7 |
2020-01-03 |
B |
8 |
2020-01-03 |
C |
comments powered by Disqus