Download notebook (.ipynb)

MPG Correlogram#

A correlogram provides a quick overview of the entire dataset and allows analysing the relationship between each pair of numerical variables.

import numpy as np
import pandas as pd

from lets_plot import *
LetsPlot.setup_html()
# Fetch the mpg dataset and discard the "Unnamed: 0" column
# (presumably a row index saved into the CSV -- it carries no data of interest).
df = pd.read_csv(
    "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv"
).drop(columns=["Unnamed: 0"])
print(df.shape)
df.head()
(234, 11)
manufacturer model displ year cyl trans drv cty hwy fl class
0 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
1 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
2 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
3 audi a4 2.0 2008 4 auto(av) f 21 30 p compact
4 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
# Pairwise correlations of the numeric columns, reshaped from a square matrix
# into long format: one row per (x, y) variable pair with its coefficient.
corr_df = (
    df.corr(numeric_only=True)
    .stack()
    .rename_axis(["x", "y"])  # name the two index levels -> future columns
    .rename("corr")           # name the values -> future column
    .reset_index()
)
corr_df.head()
x y corr
0 displ displ 1.000000
1 displ year 0.147843
2 displ cyl 0.930227
3 displ cty -0.798524
4 displ hwy -0.766020
# Split the long-format correlations into the three regions of the grid.
# The two triangles are told apart by comparing the variable names as strings.
on_diagonal = corr_df["x"] == corr_df["y"]
in_pie_half = corr_df["x"] < corr_df["y"]
in_square_half = corr_df["x"] > corr_df["y"]

# Diagonal cells: one row per variable (used for the name labels).
corr_df0 = corr_df[on_diagonal]

# Pie cells: every pair contributes two slices -- the correlation itself and
# the signed remainder that completes it to +1 (positive) or -1 (otherwise).
pie_pairs = corr_df[in_pie_half]
pie_remainder = np.where(
    pie_pairs["corr"] > 0,
    1 - pie_pairs["corr"],
    -1 - pie_pairs["corr"],
)
corr_df1 = pd.concat(
    [
        pie_pairs.assign(half="corr"),
        pie_pairs.assign(corr=pie_remainder, half="remainder"),
    ],
    ignore_index=True,
)

# Square cells: the spoke angle encodes the sign of the correlation
# (pi/4 for positive values, 3*pi/4 otherwise).
square_pairs = corr_df[in_square_half]
spoke_angle = np.where(square_pairs["corr"] > 0, np.pi / 4, 3 * np.pi / 4)
corr_df2 = square_pairs.assign(angle=spoke_angle)

# Alphabetically ordered variable names, used as axis breaks by the plot.
vars = sorted(corr_df["x"].unique())

The Correlogram#

Let’s plot a correlogram of the mpg dataset variables. Here’s how it should be interpreted:

  • The filled portion of the pie shows the correlation magnitude.

  • The diagonal of the squares shows the sign of the correlation.

  • The depth of the figures' shading shows the correlation magnitude.

  • The names of the variables are on the diagonal.

(
    ggplot()
    # Pies: slices sized by "corr" and coloured by the correlation value.
    + geom_pie(
        aes("x", "y", slice="corr", paint_a="corr"),
        data=corr_df1, stat='identity',
        size=1, size_unit='x', spacer_width=1,
        fill_by='paint_a', tooltips='none',
    )
    # Overlay pies: filled by the "half" category through the manual
    # 'paint_b' scale (transparent / lightgrey) defined below.
    + geom_pie(
        aes("x", "y", slice="corr", paint_b="half"),
        data=corr_df1, stat='identity',
        size=1, size_unit='x', show_legend=False,
        fill_by='paint_b', tooltips='none',
    )
    # Squares: one cell-sized point per pair, filled by the correlation value.
    + geom_point(
        aes("x", "y", paint_a="corr"),
        data=corr_df2, shape=22, size=1, stroke=1, size_unit='x',
        color="white", fill_by='paint_a', tooltips='none',
    )
    # Diagonal stroke across each square; its angle carries the sign.
    + geom_spoke(
        aes("x", "y", angle="angle"),
        radius=np.sqrt(2), data=corr_df2,
        pivot='middle', size=1, color="white",
    )
    # Variable names on the diagonal cells.
    + geom_text(
        aes("x", "y", label="x"),
        data=corr_df0, size=1, size_unit='x',
    )
    + scale_x_discrete(breaks=vars, expand=[.1, 0])
    + scale_y_discrete(breaks=vars, expand=[.1, 0])
    # Diverging colour scale for the correlation value.
    + scale_gradient2('paint_a', low="#a50026", mid="white", high="#313695")
    + scale_manual('paint_b', values=["rgba(0, 0, 0, 0)", "lightgrey"])
    + coord_fixed()
    + ggsize(660, 600)
    + theme_void()
)