Download notebook (.ipynb)

Statistical Annotations: geom_bracket() and geom_bracket_dodge()#

geom_bracket() is a general-purpose layer for annotating ranges. It is particularly effective for adding significance bars (p-values) to categorical plots.

geom_bracket_dodge() is a specialized layer designed for annotating ranges between dodged positions (usually dodged groups within a category).

Note: geom_bracket does not compute statistics internally. It is a visualization tool that renders the results of your analysis. In this demo, we use scipy.stats to calculate p-values before passing them to the plot.

import pandas as pd
from scipy.stats import mannwhitneyu

from lets_plot import *
LetsPlot.setup_html()
# In the examples below, we will use the Mann–Whitney U test to calculate p-values
def get_p_value(df, cat_col, val_col, g1, g2):
    """Return the two-sided Mann–Whitney U p-value for two groups of ``df``.

    The two samples are the ``val_col`` values of the rows whose ``cat_col``
    equals ``g1`` and ``g2`` respectively.
    """
    labels = df[cat_col]
    first_sample = df.loc[labels == g1, val_col]
    second_sample = df.loc[labels == g2, val_col]
    test_result = mannwhitneyu(first_sample, second_sample, alternative="two-sided")
    return test_result.pvalue

def get_p_values_data(df, *, cat_col, val_col, base_dy, step):
    """Build the bracket dataset for all pairwise category comparisons.

    Every unordered pair of distinct ``cat_col`` values (taken in order of
    first appearance) yields one row holding the pair as ``xmin``/``xmax``,
    its p-value ``p`` and a bracket height ``y``. Heights start at
    ``base_dy`` and move by ``step`` for each subsequent pair.

    With fewer than two categories there are no pairs, and the result is an
    empty frame without any columns.
    """
    from itertools import combinations

    levels = df[cat_col].unique().tolist()
    records = [
        {
            "y": base_dy + pair_no * step,
            "p": get_p_value(df, cat_col, val_col, left, right),
            "xmin": left,
            "xmax": right,
        }
        for pair_no, (left, right) in enumerate(combinations(levels, 2))
    ]
    return pd.DataFrame(records)
# Load the mpg dataset that the examples below plot, then show its size and first rows.
df = pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/refs/heads/master/data/mpg.csv")
print(df.shape)
df.head()
(234, 12)
Unnamed: 0 manufacturer model displ year cyl trans drv cty hwy fl class
0 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
1 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
2 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
3 4 audi a4 2.0 2008 4 auto(av) f 21 30 p compact
4 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact

Adding Significance Bars (p-values) to Categorical Plots#

Compute Significance Data#

# One row per pair of drv categories: the first bracket sits at y=47 and each
# following bracket is placed 4 units higher so they do not overlap.
p_values_df = get_p_values_data(df, cat_col="drv", val_col="hwy", base_dy=47, step=4)
print(p_values_df.shape)
p_values_df
(3, 4)
y p xmin xmax
0 47 9.041090e-28 f 4
1 51 5.955333e-11 f r
2 55 4.104577e-02 4 r
# Base plot reused by the bracket examples below: hwy boxplots per drv category
# with the raw observations jittered horizontally on top of the boxes.
p = ggplot(df, aes("drv", "hwy", fill="drv")) + \
    geom_boxplot(alpha=.25) + \
    geom_jitter(aes(color="drv"), height=0, shape=1, alpha=.25, show_legend=False, seed=42)

Basic p-value Annotations#

# Each row of p_values_df becomes one bracket spanning xmin..xmax at height y;
# the raw p-value column is used as the label text.
p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="p"), 
                 data=p_values_df  # <-- Pass p-values data to the brackets layer
                )

Format Labels with label_format#

# Same brackets as before; label_format controls how the numeric "p" column is rendered as label text.
p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="p"), data=p_values_df, label_format=".2~g")

Format Labels with label_pvalue() from mizani#

Mizani is a scales package for graphics: https://pypi.org/project/mizani/

from mizani.labels import label_pvalue

# Pre-format the p-values into a text column ("label") with mizani and map
# that column, instead of the raw numbers, to the label aesthetic.
formatter = label_pvalue(add_p=True)
p_values_df = p_values_df.assign(label=lambda d: formatter(d["p"]))

p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="label"), data=p_values_df)

Apply Custom Formatting Logic (Significance Stars)#

Map p-values to custom strings (like *** or ns) using your own function.

def stars_formatter(value):
    """Translate a p-value into a significance marker.

    Returns "***" for p <= 0.001, "**" for p <= 0.01, "*" for p <= 0.05
    and "ns" for anything else.
    """
    # Cut-offs are checked from strictest to loosest; the first match wins.
    for cutoff, marker in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if value <= cutoff:
            return marker
    return "ns"

# Add a "star" column holding the significance marker of every p-value and use it as the bracket label.
p_values_df = p_values_df.assign(star=lambda d: d["p"].map(stars_formatter))

p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="star"), data=p_values_df)

Annotating Dodged Groups#

To draw brackets between dodged groups, use geom_bracket_dodge(). Instead of using data coordinates, this layer uses group indices:

  • x: The main category on the x-axis.

  • istart, iend: The zero-based indices of the dodged groups to compare.

# Group indices must be listed in the same order in which the groups appear on the plot in the other layers
group_indices = {4.0: 0, 6.0: 1, 8.0: 2, 5.0: 3}
# For every class, compare all pairs of cyl values found in that class and add:
#   x           - the class the brackets belong to,
#   star        - the significance marker of the p-value,
#   start / end - the dodged-group indices of the two compared cyl values.
# The `d.shape[0] > 0` guards handle classes for which get_p_values_data yields
# no pairs: that empty frame has no "p"/"xmin"/"xmax" columns to map over.
p_values_grouped_df = pd.concat([
    get_p_values_data(df[df["class"] == name], cat_col="cyl", val_col="hwy", base_dy=47, step=4).assign(
        x=name,
        star=lambda d: d["p"].map(stars_formatter) if d.shape[0] > 0 else None,
        start=lambda d: d["xmin"].map(group_indices) if d.shape[0] > 0 else None,
        end=lambda d: d["xmax"].map(group_indices) if d.shape[0] > 0 else None,
    )
    for name in df["class"].unique()
]).reset_index(drop=True)

p_values_grouped_df.head()
y p xmin xmax x star start end
0 47.0 0.000025 4.0 6.0 compact *** 0 1
1 51.0 0.910367 4.0 5.0 compact ns 0 3
2 55.0 0.030158 6.0 5.0 compact * 1 3
3 47.0 0.030194 6.0 8.0 midsize * 1 2
4 51.0 0.000009 6.0 4.0 midsize *** 1 0
# hwy boxplots per class, dodged by cyl, with the raw points jitter-dodged on top.
# The bracket layer uses its own dataset: "x" selects the class, and "start"/"end"
# select the dodged groups to connect within that class.
# NOTE(review): `seed` is passed to geom_point rather than to position_jitterdodge —
# confirm that it actually fixes the jitter for this layer.
(ggplot(df, aes("class", "hwy", fill=as_discrete("cyl")))
  + geom_boxplot(alpha=.25)
  + geom_point(aes(color=as_discrete("cyl")),
               position=position_jitterdodge(jitter_width=.2, jitter_height=0),
               shape=1, alpha=.25, show_legend=False, seed=42)
  + geom_bracket_dodge(aes("x", "y", istart="start", iend="end", label="star"), data=p_values_grouped_df)
  + scale_brewer(["color", "fill"], palette="Accent", format="d")
  + ggsize(1000, 500))

Customizing Bracket Geometry#

Bottom-Axis (Upside Down) Annotations#

To place brackets at the bottom of a plot, use negative values for lenstart and lenend.
You may also need to adjust vjust to position the labels correctly outside the bracket.

# Same pairwise comparisons as before, but stacked downwards: the first bracket
# is at y=8 and every following one is 4 units lower (negative step).
# The labels are pre-formatted with the mizani formatter created earlier.
upside_down_p_values_df = get_p_values_data(df, cat_col="drv", val_col="hwy", base_dy=8, step=-4).assign(
    label=lambda d: formatter(d["p"])
)
print(upside_down_p_values_df.shape)
upside_down_p_values_df
(3, 5)
y p xmin xmax label
0 8 9.041090e-28 f 4 p<0.001
1 4 5.955333e-11 f r p<0.001
2 0 4.104577e-02 4 r p=0.041
# Draw the downward-stacked brackets below the boxes of the base plot.
(p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="label"), data=upside_down_p_values_df,
                  lenstart=-5, lenend=-5,  # Negative values to reverse the direction of the brackets
                  vjust=2))                # Specify vjust to move the labels under the brackets

Precise Alignment with tiplength_unit='identity'#

By default, bracket tips use a fixed length. Setting tiplength_unit='identity' allows you to specify tip lengths in data units.
This is essential when you want a bracket tip to reach a specific coordinate, such as the exact top of a boxplot or a particular data point.

# Let's recall the y-coordinates from the datasets.
# The "upper limit" printed for each category is the maximum hwy value in that category.
print(f"Upper limits of boxes for each category:\n{df.groupby('drv')['hwy'].max().to_dict()}\n")
print("Dataset with p-values:")
p_values_df
Upper limits of boxes for each category:
{'4': 28, 'f': 44, 'r': 26}

Dataset with p-values:
y p xmin xmax label star
0 47 9.041090e-28 f 4 p<0.001 ***
1 51 5.955333e-11 f r p<0.001 ***
2 55 4.104577e-02 4 r p=0.041 *
# The tip lengths are chosen so that each tip reaches either the previous bracket or the box.
# Here upper_box_limit_* is the per-category maximum of hwy (f: 44, 4: 28, r: 26).
# Example (first bracket):
#   lenstart = y - upper_box_limit_at_xmin - 1
#     -> 47 - 44 - 1 = 2
#   lenend   = y - upper_box_limit_at_xmax - 1
#     -> 47 - 28 - 1 = 18
# lenstart / lenend are given per bracket, in the row order of p_values_df.
(p + geom_bracket(aes(xmin="xmin", xmax="xmax", y="y", label="label",
                      lenstart=[2, 1, 1],
                      lenend=[18, 24, 1]),
                  data=p_values_df,
                  tiplength_unit='identity'))  # identity units (data space)

Non-Statistical Annotations: Range Grouping#

Brackets can be used for more than just drawing p-values. For example, they can be used to highlight cluster boundaries in the following scatter plot:

def get_continuous_brackets_data(df, cat_col, primary_val_col, secondary_val_col, step_mult,
                                 categories=("r", "4", "f")):
    """Compute per-category value ranges plus a level at which to draw each bracket.

    For every category of ``cat_col`` the minimum and maximum of
    ``primary_val_col`` are computed; these are the ends of the bracket.
    Brackets are placed along the secondary axis beyond the data: the first
    one sits one step past the maximum of ``secondary_val_col`` and each
    following one is one more step away.

    Parameters:
        df: Source data; it is not modified.
        cat_col: Name of the column that defines the categories.
        primary_val_col: Column whose min/max give the bracket ends.
        secondary_val_col: Column along which the brackets are stacked.
        step_mult: Distance between bracket levels, as a fraction of the
            ``secondary_val_col`` range.
        categories: Category values in the order in which their brackets
            are stacked. The default is the ``drv`` ordering used by this
            notebook; pass the values of ``cat_col`` for other columns.
            Values of ``cat_col`` that are not listed are ignored.

    Returns:
        A DataFrame with one row per entry of ``categories`` and the columns
        ``index`` (stacking position), ``cat_col``, ``min_<primary_val_col>``,
        ``max_<primary_val_col>`` and ``<secondary_val_col>_level``.
    """
    secondary_min = df[secondary_val_col].min()
    secondary_max = df[secondary_val_col].max()
    step = (secondary_max - secondary_min) * step_mult
    base = secondary_max + step

    # Work on a copy so that the caller's frame keeps its original dtype.
    df = df.copy()
    # An ordered categorical makes groupby emit the groups in the requested
    # order (and keep listed-but-absent categories, since observed=False).
    df[cat_col] = pd.Categorical(df[cat_col], categories=list(categories), ordered=True)

    ranges = (
        df.groupby(cat_col, observed=False)[primary_val_col]
        .agg(["min", "max"])
        .rename(columns={"min": f"min_{primary_val_col}", "max": f"max_{primary_val_col}"})
        .reset_index()  # turn the category index into a regular column
        .reset_index()  # add the positional "index" column used for stacking
    )
    return ranges.assign(**{f"{secondary_val_col}_level": base + ranges["index"] * step})

# Scatter plot of hwy vs cty; for every drv category one horizontal bracket marks
# its hwy range and one vertical bracket marks its cty range. Each bracket layer
# gets its own dataset, with the bracket levels placed beyond the data range.
(ggplot()
  + geom_point(aes("hwy", "cty", color="drv"), data=df, alpha=.25, show_legend=False)

    # Horizontal brackets for hwy ranges
  + geom_bracket(aes(xmin="min_hwy", xmax="max_hwy", y="cty_level", label="drv", color="drv"),
                 data=get_continuous_brackets_data(df, "drv", "hwy", "cty", .1))
 
    # Vertical brackets for cty ranges
  + geom_bracket(aes(x="hwy_level", ymin="min_cty", ymax="max_cty", label="drv", color="drv"),
                 data=get_continuous_brackets_data(df, "drv", "cty", "hwy", .05))
  + theme_minimal() + labs(x='hwy', y='cty'))