Over the life of any given data analysis project, I typically use a mix of both SQL and pandas code at various points. This notebook contains some reference material and gotchas for fluently translating between the two.
import numpy as np
import pandas as pd
import sqlite3
Package Version
--------- ---------
python 3.8.8
numpy 1.21.5
pandas 1.4.2
Boolean logic
SQL truth table
Boolean comparisons in SQL typically follow three-valued logic.
# Build SQL's three-valued-logic truth table by evaluating the comparisons
# directly in SQLite over every (True, False, NULL) pairing.
query = """
WITH t(A, B) AS (
VALUES
(True, True),
(True, False),
(True, NULL),
(False, True),
(False, False),
(False, NULL),
(NULL, True),
(NULL, False),
(NULL, NULL)
)
SELECT
*,
A AND B AS "A and B",
A OR B as "A or B",
A = B AS "A = B",
A IS B AS "A is B"
FROM t
"""
# Throwaway in-memory database — no file is created.
db = sqlite3.connect(':memory:')
sql_truth_table = (
    pd
    # coerce_float=False keeps SQLite's 0/1/NULL from becoming floats,
    # so the nullable-boolean cast below round-trips cleanly.
    .read_sql_query(query, db, coerce_float=False)
    .astype('boolean')
)
# Styler.hide_index() is deprecated (see the FutureWarning this cell used to
# emit); hide(axis='index') is the supported spelling since pandas 1.4.
sql_truth_table.style.applymap(bool_color_func).hide(axis='index')
/var/folders/4f/ts0zsdvj3kv4c2vgs4c0_5sr0000gn/T/ipykernel_57319/3850775812.py:31: FutureWarning: this method is deprecated in favour of `Styler.hide(axis='index')`
sql_truth_table.style.applymap(bool_color_func).hide_index()
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | True |
True | False | False | True | False | False |
True | True | False | |||
False | True | False | True | False | False |
False | False | False | False | True | True |
False | False | False | |||
True | True | False | |||
False | False | False | |||
True |
Pandas truth table (object)
The default logic for comparison in pandas—when dtype is object
—behaves somewhat differently.
Generally/roughly, nulls are treated as falsy.
# Every ordered pairing of True / False / None, mirroring the SQL VALUES list.
_tf_null = (True, False, None)
pd_bool_pairs = pd.DataFrame(
    [(a, b) for a in _tf_null for b in _tf_null],
    columns=['A', 'B'],
)
def calc_comparisons(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the four comparison columns appended.

    Adds 'A and B' (&), 'A or B' (|), 'A = B' (==), and 'A is B'.
    Note that `A is B` is Python *object identity* between the two Series,
    so that column is one broadcast scalar (False for distinct columns) —
    this asymmetry with SQL's IS is exactly what the notebook demonstrates.
    """
    out = df.copy()
    out['A and B'] = out.A & out.B
    out['A or B'] = out.A | out.B
    out['A = B'] = out.A == out.B
    out['A is B'] = out.A is out.B
    return out
# Render the object-dtype truth table.
# Styler.hide_index() is deprecated (FutureWarning); use hide(axis='index').
(
    pd_bool_pairs
    .pipe(calc_comparisons)
    .style
    .applymap(bool_color_func)
    .hide(axis='index')
)
/var/folders/4f/ts0zsdvj3kv4c2vgs4c0_5sr0000gn/T/ipykernel_57319/472396035.py:15: FutureWarning: this method is deprecated in favour of `Styler.hide(axis='index')`
pd_bool_pairs
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | False |
True | False | False | True | False | False |
True | None | False | True | False | False |
False | True | False | True | False | False |
False | False | False | False | True | False |
False | None | False | False | False | False |
None | True | False | False | False | False |
None | False | False | False | False | False |
None | None | False | False | False | False |
When nulls are involved, OR (`|`) is not commutative: the order of A and B matters.
/var/folders/4f/ts0zsdvj3kv4c2vgs4c0_5sr0000gn/T/ipykernel_57319/2498563999.py:2: FutureWarning: this method is deprecated in favour of `Styler.hide(axis='index')`
pd_bool_pairs
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | None | False | True | False | False |
None | True | False | False | False | False |
Pandas truth table (boolean)
Casting as a nullable boolean data type before comparisons yields identical results to SQL—for all comparisons except is
.
# Same table, but cast to the nullable 'boolean' dtype first so the logical
# operators follow SQL-style three-valued logic.
# Styler.hide_index() is deprecated (FutureWarning); use hide(axis='index').
(
    pd_bool_pairs
    .astype('boolean')
    .pipe(calc_comparisons)
    .style
    .applymap(bool_color_func)
    .hide(axis='index')
)
/var/folders/4f/ts0zsdvj3kv4c2vgs4c0_5sr0000gn/T/ipykernel_57319/2928955445.py:2: FutureWarning: this method is deprecated in favour of `Styler.hide(axis='index')`
pd_bool_pairs
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | False |
True | False | False | True | False | False |
True | True | False | |||
False | True | False | True | False | False |
False | False | False | False | True | False |
False | False | False | |||
True | True | False | |||
False | False | False | |||
False |
Templating SQL queries
Python list to SQL query string
Sometimes you want to write an SQL query which filters on some dynamic results from a pandas DataFrame.
vals = [
    'k24gpp',
    'tq5p0b',
    '03b5t9',
    'vpie2y',
    'u77lya'
]
# Quote each value for SQL, doubling any embedded single quote so a stray
# quote can't break out of the literal. For genuinely untrusted input,
# prefer a parameterized query (e.g. sqlite3 '?' placeholders) over string
# interpolation.
vals_sql = ', '.join("'{}'".format(x.replace("'", "''")) for x in vals)
sql = f"""
SELECT name
FROM users
WHERE userid IN ({vals_sql})
"""
print(sql)
SELECT name
FROM users
WHERE userid IN ('k24gpp', 'tq5p0b', '03b5t9', 'vpie2y', 'u77lya')
Equivalent functions
Splitting strings
Suppose you have a column food_type
which looks something like:
food_type | |
---|---|
0 | vegetable:carrot |
1 | vegetable:potato |
2 | dairy:milk |
In (most flavours of) SQL you could simply write something like:
SELECT
SPLIT(food_type, ':')[0] AS category,
SPLIT(food_type, ':')[1] AS item
FROM grocery
One approach—which is more interpretable:
# Split once, then unzip the (category, item) pairs into two new columns.
split_pairs = df['food_type'].str.split(':').tolist()
df['category'], df['item'] = zip(*split_pairs)
df
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
Alternatively, here is a pipe version that is less readable, but compatible with method chaining.
# Chain-friendly variant: build the split columns as their own frame,
# then join them onto the original (index-aligned).
split_frame = pd.DataFrame(
    df['food_type'].str.split(':').tolist(),
    columns=['category', 'item'],
)
df.pipe(lambda frame: frame.join(split_frame))
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
Here is a third approach, which is both chain friendly and readable, but requires defining a standalone function.
def split_to_cols(df, input_col, output_A, output_B, delimiter):
    """Split a specified column by a delimiter, creating two new columns.

    Parameters
    ----------
    df : pd.DataFrame -- frame containing `input_col` (string values).
    input_col : str -- column to split.
    output_A, output_B : str -- names for the first and second piece.
    delimiter : str -- separator to split on.

    Returns a copy; the input frame is not mutated.
    """
    df = df.copy()
    # Split each value once (the original split every value twice,
    # once per output column, via two .apply calls).
    parts = df[input_col].str.split(delimiter)
    df[output_A] = parts.str[0]
    df[output_B] = parts.str[1]
    return df
# Same as the pipe form, calling the helper directly.
split_to_cols(
    df,
    input_col='food_type',
    output_A='category',
    output_B='item',
    delimiter=':',
)
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
CASE WHEN
Inspiration
- Pandas conditional creation of a series/dataframe column
- From SQL to pandas: conditional and calculated columns in pandas
# Demo frame: a null in each column to exercise the default/unmatched path.
df = pd.DataFrame(
    [(1, 'A'), (1, 'B'), (2, 'C'), (None, None)],
    columns=['num', 'letter'],
)
df
num | letter | |
---|---|---|
0 | 1.0 | A |
1 | 1.0 | B |
2 | 2.0 | C |
3 | NaN | None |
Series
def case_when(df: pd.Series, lambdas: dict, default = None) -> pd.Series:
    """ Simulates SQL's CASE WHEN syntax for pandas.

    Based on a dict of conditions, evaluated sequentially using np.select:
    the first condition that matches a row wins, like SQL's CASE WHEN.

    Parameters
    ----------
    df : pd.Series -- series the conditions are evaluated against.
    lambdas : dict -- maps output label -> callable(series) returning a
        boolean mask.
    default : value used for rows no condition matches.

    Returns a Series aligned to the input's index.
    """
    executed_lambdas = {k: v(df) for k, v in lambdas.items()}
    labels, conditions = zip(*executed_lambdas.items())
    arr = np.select(conditions, labels, default=default)
    # Preserve the caller's index: the original returned a fresh RangeIndex,
    # which silently misaligns when assigned back to a frame whose index is
    # not 0..n-1.
    return pd.Series(arr, index=df.index)
# Labels keyed by the condition that produces them; first match wins.
lambdas = {
    'One': lambda x: x == 1,
    'Two': lambda x: x == 2,
    'Three': lambda x: x == 3,
}
df.assign(output=lambda frame: case_when(frame['num'], lambdas))
num | letter | output | |
---|---|---|---|
0 | 1.0 | A | One |
1 | 1.0 | B | One |
2 | 2.0 | C | Two |
3 | NaN | None | None |
DataFrame
df
num | letter | |
---|---|---|
0 | 1.0 | A |
1 | 1.0 | B |
2 | 2.0 | C |
3 | NaN | None |
def case_when_df(df: pd.DataFrame, lambdas: dict, default='UNEXPECTED', expect_exclusive=True, expect_nulls=False) -> pd.Series:
    """ Simulates SQL's CASE WHEN syntax for pandas.

    Based on a dict of conditions, evaluated sequentially using np.select:
    the first condition that matches a row wins, like SQL's CASE WHEN.

    Parameters
    ----------
    df : pd.DataFrame -- frame the case conditions are evaluated against.
    lambdas : dict -- maps output label -> callable(df) returning a boolean
        Series (a Series is required for the null check below).
    default : label used for rows matched by no case.
    expect_exclusive : assert that no row is matched by more than one case.
    expect_nulls : if False, assert that no condition mask contains nulls.

    Returns a Series of labels, one per row, aligned to df's index.
    (The original annotation claimed a DataFrame; a Series is what is
    actually returned.)
    """
    assert isinstance(df, pd.DataFrame), 'Expecting DataFrame as input'
    executed_lambdas = {k: v(df) for k, v in lambdas.items()}
    labels, conditions = zip(*executed_lambdas.items())
    if expect_exclusive:
        # Row-wise: at most one case may fire per row. The previous check
        # (max() of the vertically-stacked masks == 1) only verified that
        # *some* case fired *somewhere*, so overlapping cases slipped
        # through — and an all-False mask set failed spuriously.
        fired_per_row = pd.concat(executed_lambdas, axis=1).sum(axis=1)
        assert (fired_per_row <= 1).all(), 'Expecting cases to be mutually exclusive'
    if not expect_nulls:
        null_counts_for_each_case = {l: c.isnull().sum() for l, c in executed_lambdas.items()}
        assert sum(null_counts_for_each_case.values()) == 0, \
            f'Not expecting nulls, but found in these cases: {[k for k, v in null_counts_for_each_case.items() if v != 0]}'
    # np.select needs clean boolean arrays (object-dtype masks raise
    # "can only perform ops with numeric values").
    conditions = [x.astype(bool) for x in conditions]
    arr = np.select(conditions, labels, default=default)
    # Preserve the caller's index so the result aligns on assignment.
    return pd.Series(arr, index=df.index)
lambdas = {
    'A & 1': lambda x: (x['letter'] == 'A') & (x['num'] == 1),
    '1': lambda x: (x['num'] == 1),
}
# These cases deliberately overlap (row 0 matches both; np.select takes the
# first match, mirroring CASE WHEN), so opt out of the mutual-exclusivity
# assertion explicitly rather than relying on it not firing.
df.pipe(case_when_df, lambdas, expect_exclusive=False)
0 A & 1
1 1
2 UNEXPECTED
3 UNEXPECTED
dtype: object
comments powered by Disqus