Geoff Ruddock

Interactive time series charts (with Bokeh)

import os, sys
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import bokeh

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # display all output cells
Package    Version
---------  ---------
python     3.8.8
numpy      1.19.5
pandas     1.3.0
bokeh      2.3.2
from bokeh.plotting import figure, show, reset_output, save
from bokeh.models import Band
from bokeh.palettes import Category10
import random
from IPython.core.display import HTML
from bokeh.plotting.figure import Figure
from bokeh.plotting import figure, show, reset_output, save
from bokeh.io import output_notebook, output_file

output_notebook(hide_banner=True)

def hugo_safe_render(fig: Figure) -> HTML:
    """ Write the bokeh figure to a standalone HTML file and return an <embed> element pointing at it.

    The file name is a random 32-bit integer plus ``.html`` and is passed to
    ``output_file`` as a relative path. Embedding the saved file (rather than
    showing the plot inline) lets the chart render in Hugo markdown as well
    as in the notebook.
    """
    # Clear the current bokeh output state before switching to file output.
    reset_output()

    name = f'{random.getrandbits(32)}.html'
    output_file(name)
    save(fig)

    # Embed height is the figure height plus 100.
    markup = f'<embed type="text/html" src="{name}" width="100%" height="{int(fig.height+100)}"></embed>'
    return HTML(markup)

Single line

Basic

# Minimal example: a 200px-high figure with sizing_mode='scale_width'.
fig = figure(plot_height=200, sizing_mode='scale_width')

# Five hand-written points to plot.
x = [1, 2, 3, 4, 5]
y = [6, 7, 2, 4, 5]

# Assigning to `_` keeps the returned renderer from being echoed as cell output.
_ = fig.line(x, y, line_width=2)

hugo_safe_render(fig)

Using ColumnDataSource

from bokeh.models import ColumnDataSource

# Import the AAPL sample data; on RuntimeError (presumably raised when the
# sample data has not been downloaded yet) download it and retry the import.
try:
    from bokeh.sampledata.stocks import AAPL
except RuntimeError:
    from bokeh import sampledata
    sampledata.download()
    from bokeh.sampledata.stocks import AAPL

# Frame with columns date / price / lower / upper, where lower and upper
# are the price shifted by -50 and +50.
aapl_df = (
    pd.DataFrame(
        {'price': AAPL['adj_close']},
        index=pd.DatetimeIndex(AAPL['date']).rename('date')
    ).reset_index()
    .assign(lower=lambda x: x['price'] - 50, upper=lambda x: x['price'] + 50)
)

# Wrap the frame so glyphs can refer to its columns by name.
cds_aapl = ColumnDataSource(aapl_df)

# Same figure setup as before, but with a datetime x-axis.
fig = figure(
    plot_height=200,
    sizing_mode='scale_width',
    x_axis_type='datetime'
)

# Columns are referenced by name and looked up in the ColumnDataSource.
_ = fig.line('date', 'price', source=cds_aapl)

hugo_safe_render(fig)

HoverTool

Bokeh docs: HoverTool

# reset_output()
# output_notebook()

from bokeh.models import HoverTool, DatetimeTickFormatter

# Fresh datetime-axis figure to which the hover tool is attached below.
fig = figure(
    plot_height=200,
    sizing_mode='scale_width',
    x_axis_type='datetime'
)

# A new ColumnDataSource is built from the same AAPL frame for this figure.
cds_aapl = ColumnDataSource(aapl_df)
_ = fig.line('date', 'price', source=cds_aapl)


def datetime(x):
    """ Return ``x`` as a NumPy array with ``datetime64`` dtype. """
    as_datetime64 = np.array(x, dtype=np.datetime64)
    return as_datetime64

# Tooltip rows read the `date` and `price` columns of the data source.
# The `{%F}` format spec on @date is handled by the 'datetime' formatter
# registered for that field in `formatters`.
fig.add_tools(HoverTool(
    tooltips=[
        ('Date', '@date{%F}'),
        ('Price', '@price')
    ],
    formatters={'@date': 'datetime'},
    mode='vline'
))

# strftime-style tick labels for the month and day scales of the x-axis.
fig.xaxis.formatter = DatetimeTickFormatter(
    months=['%b %Y'],
    days=['%b %d']
)

hugo_safe_render(fig)

Generic helper class

Generating a plot with bokeh involves quite a bit of boilerplate code which we likely want to re-use between plots:

Let’s create a helper class that handles all of this for us.

Generate data

from bokeh.sampledata.stocks import AAPL, GOOG

# Adjusted closing prices as Series named after their ticker, indexed by date.
goog = pd.Series(GOOG['adj_close'], index=pd.DatetimeIndex(GOOG['date']).rename('date')).rename('GOOG')
aapl = pd.Series(AAPL['adj_close'], index=pd.DatetimeIndex(AAPL['date']).rename('date')).rename('AAPL')

# Long-format frame indexed by (date, stock) with columns val / lower / upper,
# where lower and upper are the price shifted by -50 and +50.
df = (
    pd.concat([goog, aapl], axis=1)
    .dropna()  # keep only dates present for both stocks
    .rename_axis('stock', axis=1)
    .stack()
    .rename('price')
    .to_frame()
    .assign(lower=lambda x: x['price'] - 50, upper=lambda x: x['price'] + 50)
    # `inplace` is omitted: False is already the default, and the keyword
    # was removed from set_axis in pandas 2.0.
    .set_axis(['val', 'lower', 'upper'], axis=1)
)

df.head()

val lower upper
date stock
2004-08-19 GOOG 100.34 50.34 150.34
AAPL 14.93 -35.07 64.93
2004-08-20 GOOG 108.31 58.31 158.31
AAPL 14.98 -35.02 64.98
2004-08-23 GOOG 109.40 59.40 159.40

Pre-process data

Depending on which sort of time series we are plotting, we may want to pass in different formats of input data:

So let’s define some functions which transform these inputs into the format expected by ColumnDataSource – a DataFrame with no index.

from typing import Union, Dict


def preproc_dispatcher(data: Union[pd.Series, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
    """ Dispatch input-specific pre-processing functions for various potential data formats.

    Bokeh expects a DataFrame (without index) as input to ColumnDataSource.

    Supported layouts:
        * Series with a single-level index
        * Series with a two-level index
        * DataFrame with a two-level index and exactly the columns
          ``val``, ``lower`` and ``upper``

    Returns:
        The dict produced by the pre-processing function selected for ``data``.

    Raises:
        AssertionError: A two-level DataFrame does not have exactly the
            columns ``val``, ``lower`` and ``upper``.
        ValueError: ``data`` matches none of the supported layouts.
    """

    n_idx_levels = len(data.index.names)
    is_srs = isinstance(data, pd.Series)
    is_df = isinstance(data, pd.DataFrame)

    if is_srs and n_idx_levels == 1:
        return preproc_single_index_series(data)

    if is_srs and n_idx_levels == 2:
        return preproc_multi_index_series(data)

    if is_df and n_idx_levels == 2:
        assert set(data.columns) == {'val', 'lower', 'upper'}, 'Expecting three column names: (val, upper, lower)'
        return preproc_multi_index_df(data)

    # Explain the failure instead of raising an empty ValueError.
    raise ValueError(
        f'Unsupported input: {type(data).__name__} with {n_idx_levels} index level(s). '
        'Expected a Series with 1 or 2 index levels, or a DataFrame with 2 index levels.'
    )

        
def preproc_single_index_series(srs: pd.Series) -> Dict[str, pd.DataFrame]:
    """ Move the index of a single-index series into a column and key the result by the series' name. """

    result = {}
    result[srs.name] = srs.reset_index()
    return result


def preproc_multi_index_series(srs: pd.Series) -> Dict[str, pd.DataFrame]:
    """ Split a two-level-index series into one index-free DataFrame per second-level value.

    Args:
        srs: Series whose index has two levels; groups are formed on the
            second level (position 1).

    Returns:
        Dict mapping each second-level value to a DataFrame holding the
        first index level and the series values as columns.
    """

    # Group the series that was passed in (not a module-level frame), and
    # pass a scalar level so that the group keys are plain labels.
    return {
        key: group.reset_index(level=1, drop=True).reset_index().sort_index()
        for key, group in srs.groupby(level=1)
    }

def preproc_multi_index_df(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """ Split a two-level-index DataFrame into one index-free DataFrame per second-level value.

    Args:
        df: DataFrame whose index has two levels; groups are formed on the
            second level (position 1).

    Returns:
        Dict mapping each second-level value to a DataFrame in which the
        first index level has become a regular column.
    """

    # A scalar level keeps the group keys as plain labels; recent pandas
    # versions yield 1-tuples as keys when a length-1 list is passed.
    return {
        key: group.reset_index(level=1, drop=True).reset_index().sort_index()
        for key, group in df.groupby(level=1)
    }
# Demo: run the dispatcher on a Series with a two-level (date, stock) index.
print('Multi-line time series\n'.upper())
multi_index_srs = df['val']

print('Input:')
multi_index_srs.head()

print('Output:')
preproc_dispatcher(multi_index_srs)
MULTI-LINE TIME SERIES

Input:





date        stock
2004-08-19  GOOG     100.34
            AAPL      14.93
2004-08-20  GOOG     108.31
            AAPL      14.98
2004-08-23  GOOG     109.40
Name: val, dtype: float64



Output:





{'AAPL':            date     val   lower   upper
 0    2004-08-19   14.93  -35.07   64.93
 1    2004-08-20   14.98  -35.02   64.98
 2    2004-08-23   15.11  -34.89   65.11
 3    2004-08-24   15.54  -34.46   65.54
 4    2004-08-25   16.07  -33.93   66.07
 ...         ...     ...     ...     ...
 2143 2013-02-25  437.00  387.00  487.00
 2144 2013-02-26  443.09  393.09  493.09
 2145 2013-02-27  438.75  388.75  488.75
 2146 2013-02-28  435.62  385.62  485.62
 2147 2013-03-01  424.83  374.83  474.83
 
 [2148 rows x 4 columns],
 'GOOG':            date     val   lower   upper
 0    2004-08-19  100.34   50.34  150.34
 1    2004-08-20  108.31   58.31  158.31
 2    2004-08-23  109.40   59.40  159.40
 3    2004-08-24  104.87   54.87  154.87
 4    2004-08-25  106.00   56.00  156.00
 ...         ...     ...     ...     ...
 2143 2013-02-25  790.77  740.77  840.77
 2144 2013-02-26  790.13  740.13  840.13
 2145 2013-02-27  799.78  749.78  849.78
 2146 2013-02-28  801.20  751.20  851.20
 2147 2013-03-01  806.19  756.19  856.19
 
 [2148 rows x 4 columns]}
# Demo: run the dispatcher on a single-index Series (the GOOG cross-section).
print('Single-line time series\n'.upper())
single_index_srs = df['val'].xs('GOOG', level=1)

print('Input:')
single_index_srs.head()

print('Output:')
preproc_dispatcher(single_index_srs)
SINGLE-LINE TIME SERIES

Input:





date
2004-08-19    100.34
2004-08-20    108.31
2004-08-23    109.40
2004-08-24    104.87
2004-08-25    106.00
Name: val, dtype: float64



Output:





{'val':            date     val
 0    2004-08-19  100.34
 1    2004-08-20  108.31
 2    2004-08-23  109.40
 3    2004-08-24  104.87
 4    2004-08-25  106.00
 ...         ...     ...
 2143 2013-02-25  790.77
 2144 2013-02-26  790.13
 2145 2013-02-27  799.78
 2146 2013-02-28  801.20
 2147 2013-03-01  806.19
 
 [2148 rows x 2 columns]}
# Demo: run the dispatcher on the full frame with val / lower / upper columns.
print('Multi-line time series with bands\n'.upper())

print('Input:')
df.head()

print('Output:')
preproc_dispatcher(df)
MULTI-LINE TIME SERIES WITH BANDS

Input:

val lower upper
date stock
2004-08-19 GOOG 100.34 50.34 150.34
AAPL 14.93 -35.07 64.93
2004-08-20 GOOG 108.31 58.31 158.31
AAPL 14.98 -35.02 64.98
2004-08-23 GOOG 109.40 59.40 159.40
Output:





{'AAPL':            date     val   lower   upper
 0    2004-08-19   14.93  -35.07   64.93
 1    2004-08-20   14.98  -35.02   64.98
 2    2004-08-23   15.11  -34.89   65.11
 3    2004-08-24   15.54  -34.46   65.54
 4    2004-08-25   16.07  -33.93   66.07
 ...         ...     ...     ...     ...
 2143 2013-02-25  437.00  387.00  487.00
 2144 2013-02-26  443.09  393.09  493.09
 2145 2013-02-27  438.75  388.75  488.75
 2146 2013-02-28  435.62  385.62  485.62
 2147 2013-03-01  424.83  374.83  474.83
 
 [2148 rows x 4 columns],
 'GOOG':            date     val   lower   upper
 0    2004-08-19  100.34   50.34  150.34
 1    2004-08-20  108.31   58.31  158.31
 2    2004-08-23  109.40   59.40  159.40
 3    2004-08-24  104.87   54.87  154.87
 4    2004-08-25  106.00   56.00  156.00
 ...         ...     ...     ...     ...
 2143 2013-02-25  790.77  740.77  840.77
 2144 2013-02-26  790.13  740.13  840.13
 2145 2013-02-27  799.78  749.78  849.78
 2146 2013-02-28  801.20  751.20  851.20
 2147 2013-03-01  806.19  756.19  856.19
 
 [2148 rows x 4 columns]}

Abstract class

from typing import Dict
from abc import ABC, abstractmethod

class TimeSeries(ABC):
    """ Base class for datetime-axis bokeh charts built from pandas data.

    The constructor pre-processes the input, creates a formatted figure and
    hands both to ``_plot``, which subclasses implement to draw the glyphs.
    """

    def __init__(self, data: Union[pd.Series, pd.DataFrame], title: str = ''):
        """ Pre-process ``data``, set up the figure and draw the series onto it.

        Args:
            data: Series or DataFrame in a layout accepted by ``preproc_dispatcher``.
            title: Plot title.

        Raises:
            AssertionError: The first columns of the pre-processed frames
                are not element-wise equal.
        """

        self.data = self._process_data(data)

        # The first column of every pre-processed frame holds the x values.
        indices = [frame.iloc[:, 0] for frame in self.data.values()]
        assert all((x == indices[0]).all() for x in indices), 'Indices are not the same'
        x_range = (indices[0].min(), indices[0].max())

        self.fig = self._setup_plot(title, x_range)
        self._plot(self.fig, self.data)

    @staticmethod
    def _process_data(data) -> Dict[str, pd.DataFrame]:
        """ Convert the raw input into a dict of index-free DataFrames. """

        return preproc_dispatcher(data)

    @staticmethod
    def _setup_plot(title, x_range) -> Figure:
        """ Wrapper that sets up fig and applies common formatting to a bokeh plot.

        Args:
            title: Plot title.
            x_range: ``(start, end)`` pair used as the x-axis range.
        """

        from bokeh.models import DataRange1d

        fig = figure(
            plot_height=300,
            sizing_mode='scale_width',
            title=title,
            x_axis_type='datetime',
            # get rid of gap on x-axis
            x_range=DataRange1d(start=x_range[0], end=x_range[1])
        )

        # FORMATTING
        fig.add_tools(HoverTool(
            tooltips=[
                ('Date', '@date{%F}'),
                ('Name', '$name'),
                ('Value', '@val')
            ],
            formatters={'@date': 'datetime'},
        ))

        return fig

    @staticmethod
    @abstractmethod
    def _plot(fig, df_dict: Dict[str, pd.DataFrame]) -> Figure:
        """ Core plotting logic; subclasses draw ``df_dict`` onto ``fig`` and return it.

        Declared static because it takes no ``self``; it is invoked as
        ``self._plot(fig, data)`` from the constructor.
        """

        raise NotImplementedError

    def show(self):
        """ Render the figure via ``hugo_safe_render`` and return the result. """

        # bokeh's own show(self.fig) is not used here; the figure goes
        # through hugo_safe_render instead.
        return hugo_safe_render(self.fig)
        

Example plots

Basic multi-line chart

Bokeh’s multi_line method is a bit confusing to use with ColumnDataSource and HoverTool, so it’s best to simply loop over each series and plot them individually. (src)

class MultipleTimeSeries(TimeSeries):
    """ Chart that draws one line per entry of the pre-processed data dict. """

    @staticmethod
    def _plot(fig, df_dict: Dict[str, pd.DataFrame]) -> Figure:
        """ Draw one line per series from its ``date`` and ``val`` columns.

        Args:
            fig: Figure to draw on.
            df_dict: Mapping of series name to its DataFrame; the key is used
                as the glyph name (shown by the ``$name`` tooltip field).

        Returns:
            The same figure, with the lines added.
        """

        # Use the 10-colour palette and wrap around, so more than three
        # series no longer raise IndexError.
        palette = Category10[10]

        for i, (stock, sub_df) in enumerate(df_dict.items()):
            color = palette[i % len(palette)]
            cds = ColumnDataSource(sub_df)

            fig.line(
                'date',
                'val',
                source=cds,
                name=stock,
                color=color
            )

        return fig

# Plot the two-level Series defined earlier, one line per stock.
MultipleTimeSeries(multi_index_srs, title='Demo of multiple time series').show()

Single series

# The same class also accepts a single-index Series (here: GOOG only).
single_index_srs = df['val'].xs('GOOG', level=1)

MultipleTimeSeries(single_index_srs, title='Demo of single time series').show()

Bands

class MultipleTimeSeriesWithBands(TimeSeries):
    """ Chart that draws each series as a line plus a shaded lower/upper band. """

    @staticmethod
    def _plot(fig, df_dict: Dict[str, pd.DataFrame]) -> Figure:
        """ Draw a line (``val``) and a band (``lower`` to ``upper``) per series.

        Args:
            fig: Figure to draw on.
            df_dict: Mapping of series name to a DataFrame with ``date``,
                ``val``, ``lower`` and ``upper`` columns.

        Returns:
            The same figure, with lines and bands added.
        """

        # Use the 10-colour palette and wrap around, so more than three
        # series no longer raise IndexError.
        palette = Category10[10]

        for i, (dim, sub_df) in enumerate(df_dict.items()):
            color = palette[i % len(palette)]
            cds = ColumnDataSource(sub_df)

            fig.line(
                'date',
                'val',
                source=cds,
                name=dim,
                color=color
            )

            # Band in the line's colour, drawn at the 'underlay' level.
            band = Band(
                base='date',
                lower='lower',
                upper='upper',
                source=cds,
                level='underlay',
                fill_alpha=0.1,
                fill_color=color,
                line_width=0
            )

            fig.add_layout(band)

        return fig

# Plot the full frame; its lower/upper columns define the bands.
MultipleTimeSeriesWithBands(df, title='Demo of upper/lower bands').show()

Moving average

class MultipleTimeSeriesWithMovingAverage(TimeSeries):
    """ Chart that draws each series faintly, overlaid with its 7-row rolling mean. """

    @staticmethod
    def _plot(fig, df_dict: Dict[str, pd.DataFrame]) -> Figure:
        """ Draw the raw values and a 7-row rolling mean for every series.

        Args:
            fig: Figure to draw on.
            df_dict: Mapping of series name to a DataFrame with ``date`` and
                ``val`` columns.

        Returns:
            The same figure, with two lines added per series.
        """

        # Use the 10-colour palette and wrap around, so more than three
        # series no longer raise IndexError.
        palette = Category10[10]

        for i, (dim, sub_df) in enumerate(df_dict.items()):
            color = palette[i % len(palette)]

            # assign() returns a new frame, so the frames in df_dict are not
            # modified. min_periods=1 yields a mean from the first row on.
            with_ma = sub_df.assign(
                val_ma7d=sub_df['val'].rolling(window=7, min_periods=1).mean()
            )
            cds = ColumnDataSource(with_ma)

            # plot 1d
            fig.line(
                'date',
                'val',
                source=cds,
                name=f'{dim} (1d)',
                color=color,
                alpha=0.1
            )

            # plot 7d
            fig.line(
                'date',
                'val_ma7d',
                source=cds,
                name=f'{dim} (7d)',
                color=color
            )

        return fig

    
# Keep only the rows from 2012-09-28 onwards.
recent_df = df.loc['2012-09-28':, :]

MultipleTimeSeriesWithMovingAverage(recent_df, title='Demo of moving averages').show()

📚 Further reading


comments powered by Disqus