Over the life of any given data analysis project, I typically use a mix of both SQL and pandas code at various points. This notebook contains some reference material and gotchas for fluently translating between the two.
import numpy as np
import pandas as pd
import sqlite3
Package Version
--------- ---------
python 3.8.8
numpy 1.21.5
pandas 1.4.2
Boolean logic
SQL truth table
Boolean comparisons in SQL typically follow three-valued logic.
# Build SQL's three-valued-logic truth table by evaluating the comparisons
# directly in SQLite over every (True, False, NULL) pairing.
query = """
WITH t(A, B) AS (
VALUES
(True, True),
(True, False),
(True, NULL),
(False, True),
(False, False),
(False, NULL),
(NULL, True),
(NULL, False),
(NULL, NULL)
)
SELECT
*,
A AND B AS "A and B",
A OR B as "A or B",
A = B AS "A = B",
A IS B AS "A is B"
FROM t
"""
# Throwaway in-memory database — no file is created.
db = sqlite3.connect(':memory:')
sql_truth_table = (
    pd
    # coerce_float=False keeps SQLite's 0/1/NULL from becoming floats,
    # so the nullable-boolean cast below round-trips cleanly.
    .read_sql_query(query, db, coerce_float=False)
    .astype('boolean')
)
# Styler.hide_index() is deprecated (see the FutureWarning this cell used to
# emit); hide(axis='index') is the supported spelling since pandas 1.4.
sql_truth_table.style.applymap(bool_color_func).hide(axis='index')
/var/folders/4f/ts0zsdvj3kv4c2vgs4c0_5sr0000gn/T/ipykernel_57319/3850775812.py:31: FutureWarning: this method is deprecated in favour of `Styler.hide(axis='index')`
sql_truth_table.style.applymap(bool_color_func).hide_index()
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | True |
True | False | False | True | False | False |
True | True | False | |||
False | True | False | True | False | False |
False | False | False | False | True | True |
False | False | False | |||
True | True | False | |||
False | False | False | |||
True |
Pandas truth table (object)
The default logic for comparison in pandas—when dtype is object
—behaves somewhat differently.
Generally/roughly, nulls are treated as falsy.
# Every ordered pairing of True / False / None, mirroring the SQL VALUES list.
_tf_null = (True, False, None)
pd_bool_pairs = pd.DataFrame(
    [(a, b) for a in _tf_null for b in _tf_null],
    columns=['A', 'B'],
)
def calc_comparisons(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the four comparison columns appended.

    Adds 'A and B' (&), 'A or B' (|), 'A = B' (==), and 'A is B'.
    Note that `A is B` is Python *object identity* between the two Series,
    so that column is one broadcast scalar (False for distinct columns) —
    this asymmetry with SQL's IS is exactly what the notebook demonstrates.
    """
    out = df.copy()
    out['A and B'] = out.A & out.B
    out['A or B'] = out.A | out.B
    out['A = B'] = out.A == out.B
    out['A is B'] = out.A is out.B
    return out
# Render the object-dtype truth table.
# Styler.hide_index() is deprecated (FutureWarning); use hide(axis='index').
(
    pd_bool_pairs
    .pipe(calc_comparisons)
    .style
    .applymap(bool_color_func)
    .hide(axis='index')
)
/var/folders/4f/ts0zsdvj3kv4c2vgs4c0_5sr0000gn/T/ipykernel_57319/472396035.py:15: FutureWarning: this method is deprecated in favour of `Styler.hide(axis='index')`
pd_bool_pairs
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | False |
True | False | False | True | False | False |
True | None | False | True | False | False |
False | True | False | True | False | False |
False | False | False | False | True | False |
False | None | False | False | False | False |
None | True | False | False | False | False |
None | False | False | False | False | False |
None | None | False | False | False | False |
When nulls are involved, OR (`|`) is not commutative: the order of A and B matters.
/var/folders/4f/ts0zsdvj3kv4c2vgs4c0_5sr0000gn/T/ipykernel_57319/2498563999.py:2: FutureWarning: this method is deprecated in favour of `Styler.hide(axis='index')`
pd_bool_pairs
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | None | False | True | False | False |
None | True | False | False | False | False |
Pandas truth table (boolean)
Casting as a nullable boolean data type before comparisons yields identical results to SQL—for all comparisons except is
.
# Same table, but cast to the nullable 'boolean' dtype first so the logical
# operators follow SQL-style three-valued logic.
# Styler.hide_index() is deprecated (FutureWarning); use hide(axis='index').
(
    pd_bool_pairs
    .astype('boolean')
    .pipe(calc_comparisons)
    .style
    .applymap(bool_color_func)
    .hide(axis='index')
)
/var/folders/4f/ts0zsdvj3kv4c2vgs4c0_5sr0000gn/T/ipykernel_57319/2928955445.py:2: FutureWarning: this method is deprecated in favour of `Styler.hide(axis='index')`
pd_bool_pairs
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | False |
True | False | False | True | False | False |
True | True | False | |||
False | True | False | True | False | False |
False | False | False | False | True | False |
False | False | False | |||
True | True | False | |||
False | False | False | |||
False |
Templating SQL queries
Python list to SQL query string
Sometimes you want to write an SQL query which filters on some dynamic results from a pandas DataFrame.
vals = [
    'k24gpp',
    'tq5p0b',
    '03b5t9',
    'vpie2y',
    'u77lya'
]
# Quote each value for SQL, doubling any embedded single quote so a stray
# quote can't break out of the literal. For genuinely untrusted input,
# prefer a parameterized query (e.g. sqlite3 '?' placeholders) over string
# interpolation.
vals_sql = ', '.join("'{}'".format(x.replace("'", "''")) for x in vals)
sql = f"""
SELECT name
FROM users
WHERE userid IN ({vals_sql})
"""
print(sql)
SELECT name
FROM users
WHERE userid IN ('k24gpp', 'tq5p0b', '03b5t9', 'vpie2y', 'u77lya')
Equivalent functions
Splitting strings
Suppose you have a column food_type
which looks something like:
food_type | |
---|---|
0 | vegetable:carrot |
1 | vegetable:potato |
2 | dairy:milk |
In (most flavours of) SQL you could simply write something like:
SELECT
SPLIT(food_type, ':')[0] AS category,
SPLIT(food_type, ':')[1] AS item
FROM grocery
One approach—which is more interpretable:
# Split once, then unzip the (category, item) pairs into two new columns.
split_pairs = df['food_type'].str.split(':').tolist()
df['category'], df['item'] = zip(*split_pairs)
df
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
Alternatively, here is a pipe version that is less readable, but compatible with method chaining.
# Chain-friendly variant: build the split columns as their own frame,
# then join them onto the original (index-aligned).
split_frame = pd.DataFrame(
    df['food_type'].str.split(':').tolist(),
    columns=['category', 'item'],
)
df.pipe(lambda frame: frame.join(split_frame))
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
Here is a third approach, which is both chain friendly and readable, but requires defining a standalone function.
def split_to_cols(df, input_col, output_A, output_B, delimiter):
    """Split a specified column by a delimiter, creating two new columns.

    Parameters
    ----------
    df : pd.DataFrame -- frame containing `input_col` (string values).
    input_col : str -- column to split.
    output_A, output_B : str -- names for the first and second piece.
    delimiter : str -- separator to split on.

    Returns a copy; the input frame is not mutated.
    """
    df = df.copy()
    # Split each value once (the original split every value twice,
    # once per output column, via two .apply calls).
    parts = df[input_col].str.split(delimiter)
    df[output_A] = parts.str[0]
    df[output_B] = parts.str[1]
    return df
# Same as the pipe form, calling the helper directly.
split_to_cols(
    df,
    input_col='food_type',
    output_A='category',
    output_B='item',
    delimiter=':',
)
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
CASE WHEN
Inspiration
- Pandas conditional creation of a series/dataframe column
- From SQL to pandas: conditional and calculated columns in pandas
# Demo frame: a null in each column to exercise the default/unmatched path.
df = pd.DataFrame(
    [(1, 'A'), (1, 'B'), (2, 'C'), (None, None)],
    columns=['num', 'letter'],
)
df
num | letter | |
---|---|---|
0 | 1.0 | A |
1 | 1.0 | B |
2 | 2.0 | C |
3 | NaN | None |
Series
def case_when(df: pd.Series, lambdas: dict, default = None) -> pd.Series:
    """ Simulates SQL's CASE WHEN syntax for pandas.

    Based on a dict of conditions, evaluated sequentially using np.select:
    the first condition that matches a row wins, like SQL's CASE WHEN.

    Parameters
    ----------
    df : pd.Series -- series the conditions are evaluated against.
    lambdas : dict -- maps output label -> callable(series) returning a
        boolean mask.
    default : value used for rows no condition matches.

    Returns a Series aligned to the input's index.
    """
    executed_lambdas = {k: v(df) for k, v in lambdas.items()}
    labels, conditions = zip(*executed_lambdas.items())
    arr = np.select(conditions, labels, default=default)
    # Preserve the caller's index: the original returned a fresh RangeIndex,
    # which silently misaligns when assigned back to a frame whose index is
    # not 0..n-1.
    return pd.Series(arr, index=df.index)
# Labels keyed by the condition that produces them; first match wins.
lambdas = {
    'One': lambda x: x == 1,
    'Two': lambda x: x == 2,
    'Three': lambda x: x == 3,
}
df.assign(output=lambda frame: case_when(frame['num'], lambdas))
num | letter | output | |
---|---|---|---|
0 | 1.0 | A | One |
1 | 1.0 | B | One |
2 | 2.0 | C | Two |
3 | NaN | None | None |
DataFrame
df
num | letter | |
---|---|---|
0 | 1.0 | A |
1 | 1.0 | B |
2 | 2.0 | C |
3 | NaN | None |
def case_when_df(df: pd.DataFrame, lambdas: dict, default='UNEXPECTED', expect_exclusive=True, expect_nulls=False) -> pd.Series:
    """ Simulates SQL's CASE WHEN syntax for pandas.

    Based on a dict of conditions, evaluated sequentially using np.select:
    the first condition that matches a row wins, like SQL's CASE WHEN.

    Parameters
    ----------
    df : pd.DataFrame -- frame the case conditions are evaluated against.
    lambdas : dict -- maps output label -> callable(df) returning a boolean
        Series (a Series is required for the null check below).
    default : label used for rows matched by no case.
    expect_exclusive : assert that no row is matched by more than one case.
    expect_nulls : if False, assert that no condition mask contains nulls.

    Returns a Series of labels, one per row, aligned to df's index.
    (The original annotation claimed a DataFrame; a Series is what is
    actually returned.)
    """
    assert isinstance(df, pd.DataFrame), 'Expecting DataFrame as input'
    executed_lambdas = {k: v(df) for k, v in lambdas.items()}
    labels, conditions = zip(*executed_lambdas.items())
    if expect_exclusive:
        # Row-wise: at most one case may fire per row. The previous check
        # (max() of the vertically-stacked masks == 1) only verified that
        # *some* case fired *somewhere*, so overlapping cases slipped
        # through — and an all-False mask set failed spuriously.
        fired_per_row = pd.concat(executed_lambdas, axis=1).sum(axis=1)
        assert (fired_per_row <= 1).all(), 'Expecting cases to be mutually exclusive'
    if not expect_nulls:
        null_counts_for_each_case = {l: c.isnull().sum() for l, c in executed_lambdas.items()}
        assert sum(null_counts_for_each_case.values()) == 0, \
            f'Not expecting nulls, but found in these cases: {[k for k, v in null_counts_for_each_case.items() if v != 0]}'
    # np.select needs clean boolean arrays (object-dtype masks raise
    # "can only perform ops with numeric values").
    conditions = [x.astype(bool) for x in conditions]
    arr = np.select(conditions, labels, default=default)
    # Preserve the caller's index so the result aligns on assignment.
    return pd.Series(arr, index=df.index)
lambdas = {
    'A & 1': lambda x: (x['letter'] == 'A') & (x['num'] == 1),
    '1': lambda x: (x['num'] == 1),
}
# These cases deliberately overlap (row 0 matches both; np.select takes the
# first match, mirroring CASE WHEN), so opt out of the mutual-exclusivity
# assertion explicitly rather than relying on it not firing.
df.pipe(case_when_df, lambdas, expect_exclusive=False)
0 A & 1
1 1
2 UNEXPECTED
3 UNEXPECTED
dtype: object
comments powered by Disqus