import os, sys
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import bokeh
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # display all output cells
Package Version
--------- ---------
python 3.8.8
numpy 1.19.5
pandas 1.3.0
bokeh 2.3.2
from bokeh.plotting import figure, show, reset_output, save
from bokeh.models import Band
from bokeh.palettes import Category10
import random
from IPython.core.display import HTML
from bokeh.plotting.figure import Figure
from bokeh.plotting import figure, show, reset_output, save
from bokeh.io import output_notebook, output_file
output_notebook(hide_banner=True)
def hugo_safe_render(fig: Figure) -> HTML:
    """Write a bokeh figure to its own HTML file and embed that file in the notebook.

    Embedding the saved file lets the plot render when the notebook is
    converted to Hugo markdown.

    Args:
        fig: The bokeh figure to save and embed.

    Returns:
        An ``HTML`` display object holding an ``<embed>`` tag that points at
        the saved file; its height is the figure height plus 100.
    """
    # Drop any previously configured output target before pointing bokeh at a new file.
    reset_output()

    # Randomly generated file name for this plot.
    name = f'{random.getrandbits(32)}.html'
    output_file(name)
    save(fig)

    markup = f'<embed type="text/html" src="{name}" width="100%" height="{int(fig.height+100)}"></embed>'
    return HTML(markup)
Single line
Basic
# Minimal example: one line drawn from two plain Python lists.
fig = figure(
    sizing_mode='scale_width',
    plot_height=200,
)
x, y = [1, 2, 3, 4, 5], [6, 7, 2, 4, 5]
# Assign to `_` so the returned glyph renderer is not echoed as cell output.
_ = fig.line(x, y, line_width=2)
hugo_safe_render(fig)
Using ColumnDataSource
from bokeh.models import ColumnDataSource
try:
from bokeh.sampledata.stocks import AAPL
except RuntimeError:
from bokeh import sampledata
sampledata.download()
from bokeh.sampledata.stocks import AAPL
# Flat frame with columns date, price, lower and upper (price -/+ 50).
aapl_df = pd.DataFrame(
    {'price': AAPL['adj_close']},
    index=pd.DatetimeIndex(AAPL['date']).rename('date'),
)
aapl_df = aapl_df.reset_index()
aapl_df = aapl_df.assign(
    lower=lambda x: x['price'] - 50,
    upper=lambda x: x['price'] + 50,
)

# Wrap the frame so glyphs can refer to its columns by name.
cds_aapl = ColumnDataSource(aapl_df)

fig = figure(
    x_axis_type='datetime',
    sizing_mode='scale_width',
    plot_height=200,
)
_ = fig.line('date', 'price', source=cds_aapl)
hugo_safe_render(fig)
HoverTool
- Field names that begin with `$` are special fields, e.g. coordinates.
- Field names that begin with `@` are columns of a `ColumnDataSource`.
- Combined as `@$name`, the tooltip looks up the value of `$name` at the hover point and then reads the column with that name.
# reset_output()
# output_notebook()
from bokeh.models import HoverTool, DatetimeTickFormatter
# Fresh data source and datetime-axis figure for the hover example.
cds_aapl = ColumnDataSource(aapl_df)
fig = figure(
    x_axis_type='datetime',
    sizing_mode='scale_width',
    plot_height=200,
)
_ = fig.line('date', 'price', source=cds_aapl)
def datetime(x):
    """Return ``x`` converted to a NumPy array with dtype ``datetime64``."""
    as_dates = np.array(x, dtype=np.datetime64)
    return as_dates
# Tooltip showing the date and price columns; 'vline' mode is requested for the hover.
fig.add_tools(
    HoverTool(
        mode='vline',
        # Tell the tooltip to treat the @date field as a datetime so `{%F}` is applied to it.
        formatters={'@date': 'datetime'},
        tooltips=[
            ('Date', '@date{%F}'),
            ('Price', '@price'),
        ],
    )
)

# Tick label formats for the day and month scales of the x-axis.
fig.xaxis.formatter = DatetimeTickFormatter(
    days=['%b %d'],
    months=['%b %Y'],
)
hugo_safe_render(fig)
Generic helper class
Generating a plot with bokeh involves quite a bit of boilerplate code which we likely want to re-use between plots:
- Instantiating the `figure` object with a datetime axis type
- Adding tooltips with `HoverTool`
- Formatting the date labels on the x-axis
Let’s create a helper class that handles all of this for us.
Generate data
from bokeh.sampledata.stocks import AAPL, GOOG
# One price series per stock, indexed by date and named after the ticker.
goog = pd.Series(GOOG['adj_close'], index=pd.DatetimeIndex(GOOG['date']).rename('date')).rename('GOOG')
aapl = pd.Series(AAPL['adj_close'], index=pd.DatetimeIndex(AAPL['date']).rename('date')).rename('AAPL')

# Long-format frame indexed by (date, stock) with columns val, lower, upper.
# The stacked series is named 'val' directly instead of being renamed afterwards
# via `set_axis(..., inplace=False)`: the `inplace` keyword is not accepted by
# newer pandas releases, and the resulting columns are the same.
df = (
    pd.concat([goog, aapl], axis=1)
    .dropna()                        # keep only dates present for both stocks
    .rename_axis('stock', axis=1)    # name the column axis so it becomes the 2nd index level
    .stack()
    .rename('val')
    .to_frame()
    .assign(lower=lambda x: x['val'] - 50, upper=lambda x: x['val'] + 50)
)
df.head()
val | lower | upper | ||
---|---|---|---|---|
date | stock | |||
2004-08-19 | GOOG | 100.34 | 50.34 | 150.34 |
AAPL | 14.93 | -35.07 | 64.93 | |
2004-08-20 | GOOG | 108.31 | 58.31 | 158.31 |
AAPL | 14.98 | -35.02 | 64.98 | |
2004-08-23 | GOOG | 109.40 | 59.40 | 159.40 |
Pre-process data
Depending on which sort of time series we are plotting, we may want to pass in different formats of input data:
- Single time series → pandas series (with a single index level)
- Multi-line time series → pandas series with a multilevel index
- Single/multi-line time series with bands → pandas dataframe with a single/multi index
So let’s define some functions which transform these inputs into the format expected by `ColumnDataSource` — a dataframe with no index.
from typing import Union, Dict
def preproc_dispatcher(data: Union[pd.Series, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
    """Route ``data`` to the pre-processing function matching its type and index depth.

    Bokeh expects a DataFrame (without index) as input to ColumnDataSource, so
    every supported input is converted to a dict of such frames.

    Supported inputs:
        * Series with one index level
        * Series with two index levels
        * DataFrame with two index levels and exactly the columns
          ``val``, ``lower`` and ``upper``

    Args:
        data: The series or frame to pre-process.

    Returns:
        Whatever the selected pre-processing function returns: a dict of
        DataFrames.

    Raises:
        AssertionError: A two-level DataFrame does not have exactly the
            columns ``val``, ``lower``, ``upper``.
        ValueError: Any other combination of input type and index depth.
    """
    n_idx_levels = data.index.nlevels
    is_srs = isinstance(data, pd.Series)
    is_df = isinstance(data, pd.DataFrame)

    if is_srs and n_idx_levels == 1:
        return preproc_single_index_series(data)
    if is_srs and n_idx_levels == 2:
        return preproc_multi_index_series(data)
    if is_df and n_idx_levels == 2:
        # Raised explicitly (rather than via `assert`) so the check still runs
        # under `python -O`; the exception type and message are unchanged.
        if set(data.columns) != {'val', 'lower', 'upper'}:
            raise AssertionError('Expecting three column names: (val, upper, lower)')
        return preproc_multi_index_df(data)

    raise ValueError(
        f'Unsupported input: {type(data).__name__} with {n_idx_levels} index level(s). '
        'Expected a Series with 1 or 2 index levels, or a DataFrame with 2 index levels.'
    )
def preproc_single_index_series(srs: pd.Series) -> Dict[str, pd.DataFrame]:
    """Turn a single-index series into a one-entry dict keyed by the series name.

    The index is moved into an ordinary column, so the resulting frame has the
    former index as its first column and the values as its second.
    """
    flat_frame = srs.reset_index()
    return {srs.name: flat_frame}
def preproc_multi_index_series(srs: pd.Series) -> Dict[str, pd.DataFrame]:
    """Split a two-level series into one index-free DataFrame per second-level label.

    Args:
        srs: Series whose second index level identifies the group.

    Returns:
        Dict mapping each second-level label to a DataFrame whose columns are
        the first index level followed by the series values.
    """
    return {
        # Drop the grouping level, then move the remaining index level into a column.
        label: sub_srs.reset_index(level=1, drop=True).reset_index().sort_index()
        # Group the series that was passed in (not a module-level frame).
        # `level=1` rather than `level=[1]` keeps the group keys scalar.
        for label, sub_srs in srs.groupby(level=1)
    }
def preproc_multi_index_df(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """Split a two-level DataFrame into one index-free DataFrame per second-level label.

    Args:
        df: DataFrame whose second index level identifies the group.

    Returns:
        Dict mapping each second-level label to a DataFrame whose first column
        is the first index level, followed by the original columns.
    """
    return {
        # Drop the grouping level, then move the remaining index level into a column.
        label: sub_df.reset_index(level=1, drop=True).reset_index().sort_index()
        # `level=1` rather than `level=[1]`: a one-element list makes newer
        # pandas yield 1-tuples as group keys instead of scalar labels.
        for label, sub_df in df.groupby(level=1)
    }
# Demo: a series with a two-level (date, stock) index, passed through the dispatcher.
print('Multi-line time series\n'.upper())
multi_index_srs = df['val']
print('Input:')
multi_index_srs.head()
print('Output:')
preproc_dispatcher(multi_index_srs)
MULTI-LINE TIME SERIES
Input:
date stock
2004-08-19 GOOG 100.34
AAPL 14.93
2004-08-20 GOOG 108.31
AAPL 14.98
2004-08-23 GOOG 109.40
Name: val, dtype: float64
Output:
{'AAPL': date val lower upper
0 2004-08-19 14.93 -35.07 64.93
1 2004-08-20 14.98 -35.02 64.98
2 2004-08-23 15.11 -34.89 65.11
3 2004-08-24 15.54 -34.46 65.54
4 2004-08-25 16.07 -33.93 66.07
... ... ... ... ...
2143 2013-02-25 437.00 387.00 487.00
2144 2013-02-26 443.09 393.09 493.09
2145 2013-02-27 438.75 388.75 488.75
2146 2013-02-28 435.62 385.62 485.62
2147 2013-03-01 424.83 374.83 474.83
[2148 rows x 4 columns],
'GOOG': date val lower upper
0 2004-08-19 100.34 50.34 150.34
1 2004-08-20 108.31 58.31 158.31
2 2004-08-23 109.40 59.40 159.40
3 2004-08-24 104.87 54.87 154.87
4 2004-08-25 106.00 56.00 156.00
... ... ... ... ...
2143 2013-02-25 790.77 740.77 840.77
2144 2013-02-26 790.13 740.13 840.13
2145 2013-02-27 799.78 749.78 849.78
2146 2013-02-28 801.20 751.20 851.20
2147 2013-03-01 806.19 756.19 856.19
[2148 rows x 4 columns]}
# Demo: selecting one stock leaves a series indexed by date only, passed through the dispatcher.
print('Single-line time series\n'.upper())
single_index_srs = df['val'].xs('GOOG', level=1)
print('Input:')
single_index_srs.head()
print('Output:')
preproc_dispatcher(single_index_srs)
SINGLE-LINE TIME SERIES
Input:
date
2004-08-19 100.34
2004-08-20 108.31
2004-08-23 109.40
2004-08-24 104.87
2004-08-25 106.00
Name: val, dtype: float64
Output:
{'val': date val
0 2004-08-19 100.34
1 2004-08-20 108.31
2 2004-08-23 109.40
3 2004-08-24 104.87
4 2004-08-25 106.00
... ... ...
2143 2013-02-25 790.77
2144 2013-02-26 790.13
2145 2013-02-27 799.78
2146 2013-02-28 801.20
2147 2013-03-01 806.19
[2148 rows x 2 columns]}
# Demo: the full (date, stock)-indexed frame with val/lower/upper columns, passed through the dispatcher.
print('Multi-line time series with bands\n'.upper())
print('Input:')
df.head()
print('Output:')
preproc_dispatcher(df)
MULTI-LINE TIME SERIES WITH BANDS
Input:
val | lower | upper | ||
---|---|---|---|---|
date | stock | |||
2004-08-19 | GOOG | 100.34 | 50.34 | 150.34 |
AAPL | 14.93 | -35.07 | 64.93 | |
2004-08-20 | GOOG | 108.31 | 58.31 | 158.31 |
AAPL | 14.98 | -35.02 | 64.98 | |
2004-08-23 | GOOG | 109.40 | 59.40 | 159.40 |
Output:
{'AAPL': date val lower upper
0 2004-08-19 14.93 -35.07 64.93
1 2004-08-20 14.98 -35.02 64.98
2 2004-08-23 15.11 -34.89 65.11
3 2004-08-24 15.54 -34.46 65.54
4 2004-08-25 16.07 -33.93 66.07
... ... ... ... ...
2143 2013-02-25 437.00 387.00 487.00
2144 2013-02-26 443.09 393.09 493.09
2145 2013-02-27 438.75 388.75 488.75
2146 2013-02-28 435.62 385.62 485.62
2147 2013-03-01 424.83 374.83 474.83
[2148 rows x 4 columns],
'GOOG': date val lower upper
0 2004-08-19 100.34 50.34 150.34
1 2004-08-20 108.31 58.31 158.31
2 2004-08-23 109.40 59.40 159.40
3 2004-08-24 104.87 54.87 154.87
4 2004-08-25 106.00 56.00 156.00
... ... ... ... ...
2143 2013-02-25 790.77 740.77 840.77
2144 2013-02-26 790.13 740.13 840.13
2145 2013-02-27 799.78 749.78 849.78
2146 2013-02-28 801.20 751.20 851.20
2147 2013-03-01 806.19 756.19 856.19
[2148 rows x 4 columns]}
Abstract class
from typing import Dict
from abc import ABC, abstractmethod
class TimeSeries(ABC):
    """Base class for bokeh time-series charts.

    The constructor pre-processes the input, creates a figure with a datetime
    x-axis and a hover tool, and then calls ``_plot`` to draw the glyphs.
    Subclasses implement only ``_plot``.
    """

    def __init__(self, data: Union[pd.Series, pd.DataFrame], title: str = ''):
        """Pre-process ``data``, set up the figure and draw the chart.

        Args:
            data: Series or DataFrame accepted by ``preproc_dispatcher``.
            title: Title passed to the figure.

        Raises:
            AssertionError: The first columns of the pre-processed frames are
                not all equal.
        """
        self.data = self._process_data(data)

        # The first column of each pre-processed frame supplies the x-axis values.
        indices = [frame.iloc[:, 0] for frame in self.data.values()]
        # Raised explicitly (rather than via `assert`) so the check still runs
        # under `python -O`; the exception type and message are unchanged.
        if not all((x == indices[0]).all() for x in indices):
            raise AssertionError('Indices are not the same')

        x_range = (indices[0].min(), indices[0].max())
        self.fig = self._setup_plot(title, x_range)
        # `_plot` draws onto `self.fig`; its return value is not needed here.
        self._plot(self.fig, self.data)

    @staticmethod
    def _process_data(data) -> Dict[str, pd.DataFrame]:
        """Convert the raw input into a dict of index-free DataFrames."""
        return preproc_dispatcher(data)

    @staticmethod
    def _setup_plot(title, x_range) -> Figure:
        """Create the figure and apply the formatting shared by all charts.

        Args:
            title: Figure title.
            x_range: ``(start, end)`` pair used as the fixed x-axis range.

        Returns:
            A figure with a datetime x-axis and a hover tool showing the
            ``date`` column, the glyph name and the ``val`` column.
        """
        from bokeh.models import DataRange1d

        fig = figure(
            plot_height=300,
            sizing_mode='scale_width',
            title=title,
            x_axis_type='datetime',
            # get rid of gap on x-axis
            x_range=DataRange1d(start=x_range[0], end=x_range[1])
        )
        # FORMATTING
        fig.add_tools(HoverTool(
            tooltips=[
                ('Date', '@date{%F}'),
                ('Name', '$name'),
                ('Value', '@val')
            ],
            formatters={'@date': 'datetime'},
        ))
        return fig

    # Declared static to match how subclasses implement it and how `__init__`
    # calls it (`self._plot(fig, data)` with no implicit instance argument).
    @staticmethod
    @abstractmethod
    def _plot(fig, df_dict: Dict[str, pd.DataFrame]) -> Figure:
        """Draw the chart's glyphs onto ``fig``; implemented by subclasses.

        Args:
            fig: Figure created by ``_setup_plot``.
            df_dict: Pre-processed data.

        Returns:
            The figure that was drawn on.
        """
        raise NotImplementedError

    def show(self):
        """Render the figure through ``hugo_safe_render`` and return its result."""
        return hugo_safe_render(self.fig)
Example plots
Basic multi-line chart
Bokeh’s `multi_line` method is a bit confusing to use with `ColumnDataSource` and `HoverTool`, so it’s best to simply loop over each series and plot them individually. (src)
class MultipleTimeSeries(TimeSeries):
    """Line chart that draws one line per entry of the pre-processed data."""

    @staticmethod
    def _plot(fig, df_dict: Dict[str, pd.DataFrame]) -> Figure:
        """Draw one line per series, reading the ``date`` and ``val`` columns.

        Args:
            fig: Figure to draw on.
            df_dict: Series name mapped to a frame with ``date`` and ``val`` columns.

        Returns:
            The same figure, with one line glyph per entry of ``df_dict``.
        """
        # Use the 10-colour palette and wrap around, so more than three series
        # no longer raises IndexError. Its first three colours match
        # Category10[3], so existing charts keep their colours.
        palette = Category10[10]
        for i, (stock, sub_df) in enumerate(df_dict.items()):
            color = palette[i % len(palette)]
            cds = ColumnDataSource(sub_df)
            fig.line(
                'date',
                'val',
                source=cds,
                name=stock,  # the hover tool reads this via `$name`
                color=color
            )
        return fig
# Render the multi-line chart from the two-level (date, stock) series.
MultipleTimeSeries(multi_index_srs, title='Demo of multiple time series').show()
Single series
# Select only the GOOG rows (a date-indexed series) and render it with the same chart class.
single_index_srs = df['val'].xs('GOOG', level=1)
MultipleTimeSeries(single_index_srs, title='Demo of single time series').show()
Bands
class MultipleTimeSeriesWithBands(TimeSeries):
    """Line chart that adds a shaded lower/upper band behind each line."""

    @staticmethod
    def _plot(fig, df_dict: Dict[str, pd.DataFrame]) -> Figure:
        """Draw one line plus one band per series.

        Args:
            fig: Figure to draw on.
            df_dict: Series name mapped to a frame with ``date``, ``val``,
                ``lower`` and ``upper`` columns.

        Returns:
            The same figure, with a line glyph and a ``Band`` per entry of ``df_dict``.
        """
        # Use the 10-colour palette and wrap around, so more than three series
        # no longer raises IndexError. Its first three colours match
        # Category10[3], so existing charts keep their colours.
        palette = Category10[10]
        for i, (dim, sub_df) in enumerate(df_dict.items()):
            color = palette[i % len(palette)]
            # Line and band share one data source.
            cds = ColumnDataSource(sub_df)
            fig.line(
                'date',
                'val',
                source=cds,
                name=dim,
                color=color
            )
            band = Band(
                base='date',
                lower='lower',
                upper='upper',
                source=cds,
                level='underlay',
                fill_alpha=0.1,
                fill_color=color,
                line_width=0
            )
            fig.add_layout(band)
        return fig
# Render the banded chart from the full frame (val, lower and upper columns).
MultipleTimeSeriesWithBands(df, title='Demo of upper/lower bands').show()
Moving average
class MultipleTimeSeriesWithMovingAverage(TimeSeries):
    """Line chart that overlays a 7-row rolling mean on each faded raw series."""

    @staticmethod
    def _plot(fig, df_dict: Dict[str, pd.DataFrame]) -> Figure:
        """Draw the raw values and their rolling mean for every series.

        Args:
            fig: Figure to draw on.
            df_dict: Series name mapped to a frame with ``date`` and ``val`` columns.

        Returns:
            The same figure, with two line glyphs per entry of ``df_dict``:
            the raw values at low alpha and the rolling mean at full opacity.
        """
        # Use the 10-colour palette and wrap around, so more than three series
        # no longer raises IndexError. Its first three colours match
        # Category10[3], so existing charts keep their colours.
        palette = Category10[10]
        for i, (dim, sub_df) in enumerate(df_dict.items()):
            color = palette[i % len(palette)]
            # `assign` returns a new frame, so the caller's frames are left
            # untouched. The window is 7 rows; `min_periods=1` lets the first
            # rows average over fewer points.
            with_ma = sub_df.assign(
                val_ma7d=sub_df['val'].rolling(window=7, min_periods=1).mean()
            )
            cds = ColumnDataSource(with_ma)
            # plot 1d
            fig.line(
                'date',
                'val',
                source=cds,
                name=f'{dim} (1d)',
                color=color,
                alpha=0.1
            )
            # plot 7d
            fig.line(
                'date',
                'val_ma7d',
                source=cds,
                name=f'{dim} (7d)',
                color=color
            )
        return fig
# Keep only rows dated 2012-09-28 or later, then render the moving-average chart.
recent_df = df.loc['2012-09-28':, :]
MultipleTimeSeriesWithMovingAverage(recent_df, title='Demo of moving averages').show()
comments powered by Disqus