Moving seamlessly between pandas and SQL
Jun 01, 2022
Over the life of any given data analysis project, I typically use a mix of both SQL and pandas code at various points. This notebook contains some reference material and gotchas for fluently translating between the two.
import numpy as np
import pandas as pd
import sqlite3
Package Version
--------- ---------
python 3.8.8
numpy 1.20.1
pandas 1.3.3
Boolean logic
SQL truth table
Boolean comparisons in SQL typically follow three-valued logic.
query = """
WITH t(A, B) AS (
VALUES
(True, True),
(True, False),
(True, NULL),
(False, True),
(False, False),
(False, NULL),
(NULL, True),
(NULL, False),
(NULL, NULL)
)
SELECT
*,
A AND B AS "A and B",
A OR B as "A or B",
A = B AS "A = B",
A IS B AS "A is B"
FROM t
"""
db = sqlite3.connect(':memory:')
sql_truth_table = (
pd
.read_sql_query(query, db, coerce_float=False)
.astype('boolean')
)
sql_truth_table.style.applymap(bool_color_func).hide_index()
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | True |
True | False | False | True | False | False |
True | True | False | |||
False | True | False | True | False | False |
False | False | False | False | True | True |
False | False | False | |||
True | True | False | |||
False | False | False | |||
True |
Pandas truth table (object)
The default logic for comparison in pandas—when dtype is object
—behaves somewhat differently.
Generally/roughly, nulls are treated as falsy.
pd_bool_pairs = pd.DataFrame({
'A': [True, True, True, False, False, False, None, None, None],
'B': [True, False, None, True, False, None, True, False, None]
})
def calc_comparisons(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
df['A and B'] = df.A & df.B
df['A or B'] = df.A | df.B
df['A = B'] = df.A == df.B
df['A is B'] = df.A is df.B
return df
(
pd_bool_pairs
.pipe(calc_comparisons)
.style
.applymap(bool_color_func)
.hide_index()
)
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | False |
True | False | False | True | False | False |
True | None | False | True | False | False |
False | True | False | True | False | False |
False | False | False | False | True | False |
False | None | False | False | False | False |
None | True | False | False | False | False |
None | False | False | False | False | False |
None | None | False | False | False | False |
When nulls are involved, OR
/ |
is not transitive. The order of A
and B
matter.
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | None | False | True | False | False |
None | True | False | False | False | False |
Pandas truth table (boolean)
Casting as a nullable boolean data type before comparisons yields identical results to SQL—for all comparisons except is
.
(
pd_bool_pairs
.astype('boolean')
.pipe(calc_comparisons)
.style
.applymap(bool_color_func)
.hide_index()
)
A | B | A and B | A or B | A = B | A is B |
---|---|---|---|---|---|
True | True | True | True | True | False |
True | False | False | True | False | False |
True | True | False | |||
False | True | False | True | False | False |
False | False | False | False | True | False |
False | False | False | |||
True | True | False | |||
False | False | False | |||
False |
Templating SQL queries
Python list to SQL query string
Sometimes you want to write an SQL query which filters on some dynamic results from a pandas DataFrame.
vals = [
'k24gpp',
'tq5p0b',
'03b5t9',
'vpie2y',
'u77lya'
]
vals_sql = ', '.join([f"'{x}'" for x in vals])
sql = f"""
SELECT name
FROM users
WHERE userid IN ({vals_sql})
"""
print(sql)
SELECT name
FROM users
WHERE userid IN ('k24gpp', 'tq5p0b', '03b5t9', 'vpie2y', 'u77lya')
Equivalent functions
Splitting strings
Suppose you have a column food_type
which looks something like:
food_type | |
---|---|
0 | vegetable:carrot |
1 | vegetable:potato |
2 | dairy:milk |
In (most flavours of) SQL you could simply write something like:
SELECT
SPLIT(food_type, ':')[0] AS category,
SPLIT(food_type, ':')[1] AS item
FROM grocery
One approach—which is more interpretable:
df['category'], df['item'] = zip(*df['food_type'].str.split(':').tolist())
df
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
Alternatively, here is pipe version that is less readable, but compatible with method chaining.
df.pipe(lambda x: x.join(pd.DataFrame(
x['food_type'].str.split(':').tolist(),
columns=['category', 'item']
)))
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
Here is a third approach, which is both chain friendly and readable, but requires defining a standalone function.
def split_to_cols(df, input_col, output_A, output_B, delimiter):
""" Split a specified column by a delimiter, creating two new columns. """
df = df.copy()
df[output_A] = df[input_col].apply(lambda x: x.split(delimiter)[0])
df[output_B] = df[input_col].apply(lambda x: x.split(delimiter)[1])
return df
df.pipe(
split_to_cols,
input_col='food_type',
output_A='category',
output_B='item',
delimiter=':'
)
food_type | category | item | |
---|---|---|---|
0 | vegetable:carrot | vegetable | carrot |
1 | vegetable:potato | vegetable | potato |
2 | dairy:milk | dairy | milk |
CASE WHEN
Inspiration
- Pandas conditional creation of a series/dataframe column
- From SQL to pandas: conditional and calculated columns in pandas
df = pd.DataFrame({'input': [1, 2, 3]})
df
input | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
def case_when(df: pd.Series, lambdas: dict, default = None) -> pd.Series:
""" Simulates SQL's CASE WHEN syntax for pandas.
Based on a dict of conditions, evaluated sequentially using np.select.
"""
executed_lambdas = {k: v(df) for k, v in lambdas.items()}
labels, conditions = zip(*executed_lambdas.items())
arr = np.select(conditions, labels, default=default)
return pd.Series(arr)
lambdas = {
'One': lambda x: x == 1,
'Two': lambda x: x == 2,
'Three': lambda x: x == 3
}
df.assign(output=lambda x: x['input'].pipe(case_when, lambdas))
input | output | |
---|---|---|
0 | 1 | One |
1 | 2 | Two |
2 | 3 | Three |