Statistical Annotations: `geom_bracket()` and `geom_bracket_dodge()`#

geom_bracket() is a general-purpose layer for annotating ranges. It is particularly effective for adding significance bars (p-values) to categorical plots.

geom_bracket_dodge() is a specialized layer designed for annotating ranges between dodged positions (usually dodged groups within a category).

Note: geom_bracket does not compute statistics internally. It is a visualization tool that renders the results of your analysis. In this demo, we use scipy.stats to calculate p-values before passing them to the plot.

import pandas as pd
from scipy.stats import mannwhitneyu

from lets_plot import *

LetsPlot.setup_html()

# In the examples below, we will use the Mann–Whitney U test to calculate p-values
def get_p_value(df, cat_col, val_col, g1, g2):
    """Return the two-sided Mann–Whitney U p-value between two groups of `df`.

    The two samples are the `val_col` values of the rows whose `cat_col`
    equals `g1` and `g2`, respectively.
    """
    labels = df[cat_col]
    values = df[val_col]
    first_sample = values[labels == g1]
    second_sample = values[labels == g2]
    result = mannwhitneyu(first_sample, second_sample, alternative="two-sided")
    return result.pvalue

def get_p_values_data(df, *, cat_col, val_col, base_dy, step):
    """Build one bracket row for every unordered pair of `cat_col` categories.

    Categories are paired in the order they first appear in `df`. The i-th
    pair (counting from zero) gets:

    - ``y``: ``base_dy + i * step``
    - ``p``: the value returned by ``get_p_value`` for that pair
    - ``xmin`` / ``xmax``: the two categories being compared

    Returns a DataFrame with one row per pair; when there are fewer than two
    categories the result is an empty frame without columns.
    """
    from itertools import combinations

    levels = df[cat_col].unique().tolist()
    return pd.DataFrame([
        {
            "y": base_dy + idx * step,
            "p": get_p_value(df, cat_col, val_col, left, right),
            "xmin": left,
            "xmax": right,
        }
        for idx, (left, right) in enumerate(combinations(levels, 2))
    ])

# Load the mpg dataset that every example below is built on.
df = pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/refs/heads/master/data/mpg.csv")
print(df.shape)
# Trailing expression: shows the first rows as the cell output.
df.head()

(234, 12)

	Unnamed: 0	manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
0	1	audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
1	2	audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
2	3	audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
3	4	audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
4	5	audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact

Adding Significance Bars (p-values) to Categorical Plots#

Compute Significance Data#

# Pairwise comparisons of `hwy` between the `drv` categories.
# The first bracket sits at y=47 and each following one is placed 4 units higher.
p_values_df = get_p_values_data(df, cat_col="drv", val_col="hwy", base_dy=47, step=4)
print(p_values_df.shape)
p_values_df

(3, 4)

	y	p	xmin	xmax
0	47	9.041090e-28	f	4
1	51	5.955333e-11	f	r
2	55	4.104577e-02	4	r

# Base plot reused by the bracket examples: a boxplot of `hwy` per `drv`
# with the individual observations jittered on top.
p = (
    ggplot(df, aes("drv", "hwy", fill="drv"))
    + geom_boxplot(alpha=.25)
    + geom_jitter(aes(color="drv"), height=0, shape=1, alpha=.25, show_legend=False, seed=42)
)

Basic p-value Annotations#

# Each row of `p_values_df` becomes one bracket spanning xmin..xmax at height y,
# labelled with the raw p-value.
p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="p"), 
                 data=p_values_df  # <-- Pass p-values data to the brackets layer
                )

Format Labels with `label_format`#

# Same brackets as before; `label_format` controls how the numeric `p` label is rendered.
p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="p"), data=p_values_df, label_format=".2~g")

Format Labels with `label_pvalue()` from `mizani`#

Mizani is a scales package for graphics: https://pypi.org/project/mizani/

from mizani.labels import label_pvalue

# Render the p-values as text up front and map the bracket label to that
# pre-formatted `label` column instead of the numeric `p` column.
formatter = label_pvalue(add_p=True)
p_values_df = p_values_df.assign(label=formatter(p_values_df["p"]))

p + geom_bracket(
    aes(xmin="xmin", xmax="xmax", y="y", label="label"),
    data=p_values_df,
)

Apply Custom Formatting Logic (Significance Stars)#

Map p-values to custom strings (like *** or ns) using your own function.

def stars_formatter(value):
    """Translate a p-value into a significance marker.

    Returns "***" for values <= 0.001, "**" for values <= 0.01,
    "*" for values <= 0.05 and "ns" for everything else.
    """
    # Cutoffs are checked from strictest to loosest; the first match wins.
    for cutoff, marker in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if value <= cutoff:
            return marker
    return "ns"

# Add a `star` column holding the significance marker of each p-value,
# then use that column as the bracket label.
p_values_df = p_values_df.assign(star=p_values_df["p"].map(stars_formatter))

p + geom_bracket(
    aes(xmin="xmin", xmax="xmax", y="y", label="star"),
    data=p_values_df,
)

Annotating Dodged Groups#

To draw brackets between dodged groups, use geom_bracket_dodge(). Instead of using data coordinates, this layer uses group indices:

x: The main category on the x-axis.
istart, iend: The zero-based indices of the dodged groups to compare.

# Group indices must follow the same order in which the groups appear on the plot for other layers
group_indices = {4.0: 0, 6.0: 1, 8.0: 2, 5.0: 3}
# One set of pairwise `cyl` comparisons per vehicle class, stacked into a single frame.
p_values_grouped_df = pd.concat([
    get_p_values_data(df[df["class"] == name], cat_col="cyl", val_col="hwy", base_dy=47, step=4).assign(
        # `x` records which class (x-axis category) the comparison belongs to.
        x=name,
        # The `d.shape[0] > 0` guards skip the column lookups when a class
        # produced no comparison rows.
        star=lambda d: d["p"].map(stars_formatter) if d.shape[0] > 0 else None,
        # Translate the compared `cyl` values into their dodge-group indices.
        start=lambda d: d["xmin"].map(group_indices) if d.shape[0] > 0 else None,
        end=lambda d: d["xmax"].map(group_indices) if d.shape[0] > 0 else None,
    )
    for name in df["class"].unique()
]).reset_index(drop=True)

p_values_grouped_df.head()

	y	p	xmin	xmax	x	star	start	end
0	47.0	0.000025	4.0	6.0	compact	***	0	1
1	51.0	0.910367	4.0	5.0	compact	ns	0	3
2	55.0	0.030158	6.0	5.0	compact	*	1	3
3	47.0	0.030194	6.0	8.0	midsize	*	1	2
4	51.0	0.000009	6.0	4.0	midsize	***	1	0

# Boxplots of `hwy` per class, dodged by `cyl`, with jitter-dodged points on top.
(ggplot(df, aes("class", "hwy", fill=as_discrete("cyl")))
  + geom_boxplot(alpha=.25)
  + geom_point(aes(color=as_discrete("cyl")),
               position=position_jitterdodge(jitter_width=.2, jitter_height=0, seed=42),
               shape=1, alpha=.25, show_legend=False, seed=42)
  # Brackets are addressed by class (`x`) plus the indices of the two dodged groups to connect.
  + geom_bracket_dodge(aes("x", "y", istart="start", iend="end", label="star"), data=p_values_grouped_df)
  + scale_brewer(["color", "fill"], palette="Accent", format="d")
  + ggsize(1000, 500))

Customizing Bracket Geometry#

Bottom-Axis (Upside Down) Annotations#

To place brackets at the bottom of a plot, use negative values for lenstart and lenend.
You may also need to adjust vjust to position the labels correctly outside the bracket.

# Same pairwise comparisons as before, but the negative step stacks the
# brackets downwards, starting at y=8.
upside_down_p_values_df = get_p_values_data(df, cat_col="drv", val_col="hwy", base_dy=8, step=-4)
upside_down_p_values_df = upside_down_p_values_df.assign(
    label=formatter(upside_down_p_values_df["p"])
)
print(upside_down_p_values_df.shape)
upside_down_p_values_df

(3, 5)

	y	p	xmin	xmax	label
0	8	9.041090e-28	f	4	p<0.001
1	4	5.955333e-11	f	r	p<0.001
2	0	4.104577e-02	4	r	p=0.041

# Draw the brackets below the data, using the downward-stacked rows of `upside_down_p_values_df`.
(p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="label"), data=upside_down_p_values_df,
                  lenstart=-5, lenend=-5,  # Negative values to reverse the direction of the brackets
                  vjust=2))                # Specify vjust to move the labels under the brackets

Precise Alignment with `tiplength_unit='identity'`#

By default, bracket tips use a fixed length. Setting tiplength_unit='identity' allows you to specify tip lengths in data units.
This is essential when you want a bracket tip to reach a specific coordinate, such as the exact top of a boxplot or a particular data point.

# Let's recall the y-coordinates from the datasets
# The printed values are the maximum `hwy` observed in each `drv` category.
print(f"Upper limits of boxes for each category:\n{df.groupby('drv')['hwy'].max().to_dict()}\n")
print("Dataset with p-values:")
# Trailing expression: shows the p-values table (including the `label` and `star` columns added earlier).
p_values_df

Upper limits of boxes for each category:
{'4': 28, 'f': 44, 'r': 26}

Dataset with p-values:

	y	p	xmin	xmax	label	star
0	47	9.041090e-28	f	4	p<0.001	***
1	51	5.955333e-11	f	r	p<0.001	***
2	55	4.104577e-02	4	r	p=0.041	*

# The tip lengths are chosen so that each tip reaches either the previous bracket or the box
# Example (first bracket):
#   lenstart = y - upper_box_limit_at_xmin - 1
#     -> 47 - 44 - 1 = 2
#   lenend   = y - upper_box_limit_at_xmax - 1
#     -> 47 - 28 - 1 = 18
# The lenstart / lenend lists hold one value per row of `p_values_df`, in row order.
(p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="label",
                      lenstart=[2, 1, 1],
                      lenend=[18, 24, 1]),
                  data=p_values_df,
                  tiplength_unit='identity'))  # identity units (data space)

Non-Statistical Annotations: Range Grouping#

Brackets can be used for more than just drawing p-values. For example, they can be used to highlight cluster boundaries in the following scatter plot:

def get_continuous_brackets_data(df, cat_col, primary_val_col, secondary_val_col, step_mult,
                                 category_order=("r", "4", "f")):
    """Compute, per category, the value range to bracket and the level to draw it at.

    For every category of `cat_col` the result holds the minimum and maximum
    of `primary_val_col` (the ends of the bracket) and a level on the
    `secondary_val_col` axis at which the bracket is drawn. Levels start one
    step beyond the maximum of `secondary_val_col` and grow by one step per
    category, so the brackets do not overlap.

    Parameters
    ----------
    df : DataFrame with the columns named by the other arguments.
        It is not modified.
    cat_col : name of the grouping column.
    primary_val_col : name of the column whose per-category range is bracketed.
    secondary_val_col : name of the column that defines the bracket levels.
    step_mult : step between levels, as a fraction of the `secondary_val_col` range.
    category_order : order in which categories are assigned levels. The default
        is the order used for the mpg `drv` example. Values of `cat_col` missing
        from this sequence are treated as missing. Pass ``None`` to use the
        categories in order of first appearance.

    Returns
    -------
    DataFrame with columns ``index``, `cat_col`, ``min_<primary_val_col>``,
    ``max_<primary_val_col>`` and ``<secondary_val_col>_level``, one row per category.
    """
    secondary = df[secondary_val_col]
    secondary_max = secondary.max()
    step = (secondary_max - secondary.min()) * step_mult
    base = secondary_max + step

    if category_order is None:
        category_order = df[cat_col].dropna().unique().tolist()

    # assign() returns a new frame, so the caller's `df` keeps its original column.
    # The ordered categorical makes groupby emit the groups in `category_order`;
    # no explicit sort is needed because min/max do not depend on row order.
    ordered = df.assign(
        **{cat_col: pd.Categorical(df[cat_col], categories=list(category_order), ordered=True)}
    )
    grouped = ordered.groupby(cat_col, observed=False)[primary_val_col]

    ranges = pd.merge(
        grouped.min().to_frame(f"min_{primary_val_col}").reset_index(),
        grouped.max().to_frame(f"max_{primary_val_col}").reset_index(),
        on=cat_col,
    ).reset_index()  # adds the positional `index` column used to space the levels

    return ranges.assign(**{f"{secondary_val_col}_level": lambda d: base + d["index"] * step})

# Scatter plot of cty vs hwy with one bracket per `drv` category along each axis,
# marking the range that category covers.
(ggplot()
  + geom_point(aes("hwy", "cty", color="drv"), data=df, alpha=.25, show_legend=False)

    # Horizontal brackets for hwy ranges
  + geom_bracket(aes(xmin="min_hwy", xmax="max_hwy", y="cty_level", label="drv", color="drv"),
                 data=get_continuous_brackets_data(df, "drv", "hwy", "cty", .1))
 
    # Vertical brackets for cty ranges
  + geom_bracket(aes(x="hwy_level", ymin="min_cty", ymax="max_cty", label="drv", color="drv"),
                 data=get_continuous_brackets_data(df, "drv", "cty", "hwy", .05))
  + theme_minimal() + labs(x='hwy', y='cty'))

Statistical Annotations: geom_bracket() and geom_bracket_dodge()#