Download notebook (.ipynb)

Lets-Plot in 2024#

from datetime import datetime

import numpy as np
import pandas as pd

from lets_plot import *
from lets_plot.bistro import *
from lets_plot.geo_data import *
The geodata is provided by © OpenStreetMap contributors and is made available here under the Open Database License (ODbL).
# Initialise Lets-Plot for HTML output in the notebook.
LetsPlot.setup_html()

# Global theme applied to every plot below: bold plot titles.
LetsPlot.set_theme(
    theme(
        plot_title=element_text(face='bold'),
    )
)

# Base URL under which all CSV files read below are stored.
DATA_ROOT_DIR = (
    "https://raw.githubusercontent.com/JetBrains/lets-plot-docs"
    "/refs/heads/master/data/lp2024"
)

Get data#

# Authors whose commits are labelled with the city "Bar".
authors_from_bar = [
    "Ivan Kupriyanov",
    "Artem Smirnov",
    "Valentin Dovidaytis",
    "Rashid Yangazov",
]
# File extension -> file kind; extensions not listed here become "other".
ext_replaces = {
    "kt": "kotlin",
    "py": "python",
    "ipynb": "demo",
    "csv": "demo",
}

# One row per (commit, file extension), with three derived columns:
#   deletions_neg - deletions with the sign flipped,
#   author_city   - "Bar" for the authors listed above, "Other" otherwise,
#   file_kind     - the extension mapped through ext_replaces.
commits_df = pd.read_csv(
    f"{DATA_ROOT_DIR}/generated/commits.csv",
    sep=";",
    parse_dates=["date"],
).assign(
    deletions_neg=lambda df: -df["deletions"],
    author_city=lambda df: np.where(df["author"].isin(authors_from_bar), "Bar", "Other"),
    file_kind=lambda df: df["extension"].map(ext_replaces).fillna("other"),
)
print(commits_df.shape)
commits_df.head()
(1537, 10)
hash repo_name author date extension additions deletions deletions_neg author_city file_kind
0 27fd1bba42290374415c80e7b85a23479617e621 lets-plot Valentin Dovidaytis 2024-12-26 12:12:18+01:00 kts 104.0 185.0 -185.0 Bar other
1 471d5a62377a3dbf92859a28b0d6e1598cb8646e lets-plot Valentin Dovidaytis 2024-12-24 16:21:31+01:00 kts 87.0 18.0 -18.0 Bar other
2 8f20cf850a0a02c0c7b707fd45667e64402156d3 lets-plot Ivan Kupriyanov 2024-12-23 21:17:01+01:00 kts 40.0 50.0 -50.0 Bar other
3 8f20cf850a0a02c0c7b707fd45667e64402156d3 lets-plot Ivan Kupriyanov 2024-12-23 21:17:01+01:00 kt 1.0 11.0 -11.0 Bar kotlin
4 325ae3b2ba27dcaeeb33364f2f537ec3f5ca0a96 lets-plot Ivan Kupriyanov 2024-12-23 19:45:12+01:00 kts 8.0 0.0 -0.0 Bar other
# Per-file totals of additions and deletions.
files_df = pd.read_csv(f"{DATA_ROOT_DIR}/generated/files.csv")
print(files_df.shape)
files_df.head()
(1864, 6)
repo_name file_path file_name extension additions deletions
0 lets-plot docs/dev/notebooks/coord_polar_wind.csv coord_polar_wind.csv csv 1147261.0 0.0
1 lets-plot docs/dev/notebooks/interactive_tools.ipynb interactive_tools.ipynb ipynb 28830.0 724.0
2 lets-plot docs/dev/notebooks/waterfall.ipynb waterfall.ipynb ipynb 22593.0 6894.0
3 lets-plot docs/dev/notebooks/coord_polar.ipynb coord_polar.ipynb ipynb 17072.0 11793.0
4 lets-plot plot-stem/src/commonMain/kotlin/org/jetbrains/... SeattleWeather.kt kt 11701.0 0.0
# Releases with a derived "total" column (added + changed + fixed),
# ordered by repository and then by release date.
releases_df = (
    pd.read_csv(
        f"{DATA_ROOT_DIR}/original/releases.csv",
        parse_dates=["date"],
    )
    .assign(total=lambda df: df["added"] + df["changed"] + df["fixed"])
    .sort_values(by=["repo_name", "date"])
    .reset_index(drop=True)
)
print(releases_df.shape)
releases_df.head()
(25, 11)
hash repo_name version release_kind date added changed fixed repo_link demo_link total
0 523aeb73c2b733839070b14b29af60e638b77f5b lets-plot 4.2.0 minor 2023-12-28 14:09:55+01:00 12 2 11 https://github.com/JetBrains/lets-plot/release... https://nbviewer.org/github/JetBrains/lets-plo... 25
1 09484c78d9c9873598bd24afde294d1180afc59a lets-plot 4.3.0 minor 2024-03-07 17:57:33+01:00 6 2 14 https://github.com/JetBrains/lets-plot/release... https://nbviewer.org/github/JetBrains/lets-plo... 22
2 a3233e9cdbe6b8474a90d7443c20564de16e3cfb lets-plot 4.3.1 patch 2024-04-16 16:09:51+02:00 7 0 25 https://github.com/JetBrains/lets-plot/release... https://nbviewer.org/github/JetBrains/lets-plo... 32
3 de1b96cc5120485112a6f6457f6351e5e0131fed lets-plot 4.3.2 patch 2024-04-25 16:48:41+02:00 0 1 4 https://github.com/JetBrains/lets-plot/release... https://nbviewer.org/github/JetBrains/lets-plo... 5
4 0f28341f1daf1bc8c263f2412894b6e8e4d3e73e lets-plot 4.3.3 patch 2024-05-29 18:14:19+02:00 1 1 5 https://github.com/JetBrains/lets-plot/release... https://nbviewer.org/github/JetBrains/lets-plo... 7
# Issues of all repositories, ordered by repository and issue number.
# Undecodable bytes in the CSV are replaced instead of raising an error.
issues_df = (
    pd.read_csv(
        f"{DATA_ROOT_DIR}/generated/issues.csv",
        sep=";",
        encoding_errors='replace',
        parse_dates=["created_at", "closed_at"],
    )
    .sort_values(by=["repo_name", "number"])
    .reset_index(drop=True)
)
print(issues_df.shape)
issues_df.head()
(769, 8)
number state author title created_at closed closed_at repo_name
0 11 CLOSED Igor Alshannikov style class `plt-tooltip` is never set 2019-08-07 21:14:10+00:00 True 2019-08-19 12:14:47+00:00 lets-plot
1 49 CLOSED Igor Alshannikov BoxPlotSceneMapper demo: error on mouse hover 2019-10-29 20:57:39+00:00 True 2019-10-31 06:43:16+00:00 lets-plot
2 50 CLOSED Igor Alshannikov BarPlotSceneMapper demo: tooltip should appear... 2019-10-29 21:55:24+00:00 True 2019-10-30 21:43:22+00:00 lets-plot
3 51 CLOSED Igor Alshannikov Implement GGBunch 2019-10-30 23:02:27+00:00 True 2020-01-27 16:04:08+00:00 lets-plot
4 52 CLOSED Igor Alshannikov Implement guides function 2019-11-01 17:14:49+00:00 True 2021-03-17 13:41:21+00:00 lets-plot
# Search performance per country; CTR is stored as text such as "2.73%",
# so the percent sign is removed and the column converted to float.
lpd_performance_countries_df = pd.read_csv(
    f"{DATA_ROOT_DIR}/original/lpd_performance/Countries.csv",
).assign(
    CTR=lambda df: df["CTR"].str.replace("%", "").astype(float),
)
print(lpd_performance_countries_df.shape)
lpd_performance_countries_df.head()
(225, 5)
Country Clicks Impressions CTR Position
0 United States 4531 165681 2.73 15.63
1 Germany 1385 35946 3.85 11.95
2 United Kingdom 1248 31753 3.93 14.36
3 Russia 526 10099 5.21 24.63
4 France 420 22315 1.88 12.55
# Resolution value passed to get_boundaries().
resolution = 3

# Geocode the country names from the performance table and fetch their
# boundaries; ignore_not_found() is applied so that unmatched names do not
# abort the request.
countries_gdf = (
    geocode_countries(lpd_performance_countries_df["Country"])
    .ignore_not_found()
    .get_boundaries(resolution)
)
print(countries_gdf.shape)
countries_gdf.head()
(192, 3)
country found name geometry
0 United States United States MULTIPOLYGON (((-178.12091 51.67737, -177.9618...
1 Germany Germany MULTIPOLYGON (((13.16065 54.55901, 13.28192 54...
2 United Kingdom United Kingdom MULTIPOLYGON (((-7.32468 57.67791, -7.06259 57...
3 Russia Russia MULTIPOLYGON (((-179.07499 71.59623, -179.3849...
4 France France MULTIPOLYGON (((-151.35269 -16.84600, -151.449...
# Search performance per day; the "%" sign is removed from CTR before the
# column is converted to float.
lpd_performance_dates_df = pd.read_csv(
    f"{DATA_ROOT_DIR}/original/lpd_performance/Dates.csv",
    parse_dates=["Date"],
).assign(
    CTR=lambda df: df["CTR"].str.replace("%", "").astype(float),
)
print(lpd_performance_dates_df.shape)
lpd_performance_dates_df.head()
(339, 5)
Date Clicks Impressions CTR Position
0 2024-12-18 75 2579 2.91 11.23
1 2024-12-17 59 2775 2.13 10.81
2 2024-12-16 106 2978 3.56 11.35
3 2024-12-15 24 1437 1.67 14.05
4 2024-12-14 19 1091 1.74 15.61
# Search performance per page; the "%" sign is removed from CTR before the
# column is converted to float.
lpd_performance_pages_df = pd.read_csv(
    f"{DATA_ROOT_DIR}/original/lpd_performance/Pages.csv",
).assign(
    CTR=lambda df: df["CTR"].str.replace("%", "").astype(float),
)
print(lpd_performance_pages_df.shape)
lpd_performance_pages_df.head()
(873, 5)
Top pages Clicks Impressions CTR Position
0 https://lets-plot.org/ 3696 45001 8.21 36.95
1 https://lets-plot.org/pages/charts.html 1202 24334 4.94 21.14
2 https://lets-plot.org/python/pages/api.html 690 34801 1.98 22.85
3 https://lets-plot.org/python/pages/charts.html 440 10262 4.29 24.11
4 https://lets-plot.org/python/pages/api/lets_pl... 267 2980 8.96 22.80
# Search performance per query; the "%" sign is removed from CTR before the
# column is converted to float.
lpd_performance_queries_df = pd.read_csv(
    f"{DATA_ROOT_DIR}/original/lpd_performance/Queries.csv",
).assign(
    CTR=lambda df: df["CTR"].str.replace("%", "").astype(float),
)
print(lpd_performance_queries_df.shape)
lpd_performance_queries_df.head()
(1000, 5)
Top queries Clicks Impressions CTR Position
0 lets plot 700 1230 56.91 1.00
1 letsplot 487 902 53.99 1.00
2 lets-plot 476 984 48.37 1.00
3 lets plot python 305 476 64.08 1.01
4 lets_plot 122 196 62.24 1.00

Plots: repositories#

def get_release_tooltip():
    """Build the tooltip used for release markers and release bands.

    The title is a link to ``repo_link`` showing repository, version and
    release kind; the lines show the release date, the breakdown of the
    total number of updates and a link to the demo notebooks.
    """
    tooltip = layer_tooltips()
    tooltip = tooltip.title("<a href=\"@repo_link\">@repo_name v@version (@release_kind)</a>")
    tooltip = tooltip.line("release date|@date")
    tooltip = tooltip.format("@date", "%d %b, %l:%M %p")
    tooltip = tooltip.line(
        "total number of updates\n= added features + changes + fixes|@total = @added + @changed + @fixed"
    )
    tooltip = tooltip.line("demo notebooks|<a href=\"@demo_link\">NBViewer</a>")
    return tooltip
def get_commits_tooltip(kind):
    """Build the tooltip for a commits histogram layer.

    ``kind`` is the word placed after the bin count in the title
    (the callers pass "additions" or "deletions").
    """
    tooltip = layer_tooltips().title(f"@..count.. {kind}")
    tooltip = tooltip.line("@|@file_kind")
    tooltip = tooltip.line("date|@date")
    return tooltip.format("@date", "%d %b")

# Horizontal background bands: one row per area (ymin, ymax, kind).
background_data = dict(
    ymin=[-3000, 0, 3000],
    ymax=[0, 3000, 3500],
    area_kind=["deletions", "additions", "releases"],
)
# Text labels placed inside the deletions and additions bands.
background_labels_data = dict(
    x=[datetime(2024, 12, 15), datetime(2024, 12, 15)],
    y=[-2000, 2000],
    label=["Deletions", "Additions"],
)

# Fill colour per background area.
area_colors = dict(
    additions="#b2df8a",
    deletions="#fb9a99",
    releases="black",
)
# Fill colour per file kind.
file_colors = dict(
    kotlin="#af1df5",
    python="#306998",
    other="gray",
)
# Fill colour per release kind.
release_colors = dict(
    major="#f03b20",
    minor="#fd8d3c",
    patch="#ffffb2",
)

# Point shape code per repository.
release_shapes = {
    "lets-plot": 22,
    "lets-plot-kotlin": 23,
    "lets-plot-skia": 21,
}

# Commits without the "demo" file kind (notebooks and CSV data).
commits_filtered_df = commits_df[commits_df["file_kind"] != "demo"]

(
    ggplot()
    # Background bands for the deletions, additions and releases areas.
    + geom_band(
        aes(ymin='ymin', ymax='ymax', paint_a='area_kind'),
        data=background_data, size=0, alpha=.75, fill_by="paint_a", tooltips='none',
    )
    # Added lines per date bin, drawn upwards.
    + geom_histogram(
        aes(x="date", y="..count..", weight="additions", paint_b="file_kind"),
        data=commits_filtered_df, bins=61, boundary=datetime(2024, 1, 1), fill_by="paint_b",
        tooltips=get_commits_tooltip("additions"),
    )
    # Deleted lines per date bin, drawn downwards (weights are negative).
    + geom_histogram(
        aes(x="date", y="..count..", weight="deletions_neg", paint_b="file_kind"),
        data=commits_filtered_df, bins=61, boundary=datetime(2024, 1, 1), fill_by="paint_b",
        tooltips=get_commits_tooltip("deletions"),
    )
    + geom_text(aes('x', 'y', label='label'), data=background_labels_data)
    # Release markers at a fixed y inside the "releases" band.
    + geom_point(
        aes(x="date", size="total", paint_c="release_kind", shape="repo_name"),
        y=3_250, data=releases_df, fill_by="paint_c",
        tooltips=get_release_tooltip(),
    )
    + coord_polar(ylim=[-3_000, 3_500])
    + scale_x_datetime(
        breaks=[datetime(2024, month, 2) for month in range(1, 13)],
        limits=[datetime(2024, 1, 1), datetime(2024, 12, 31)], expand=[0, 0], format="%b",
    )
    + scale_y_continuous(expand=[0, 0])
    + scale_manual("paint_a", values=area_colors, guide='none')
    + scale_manual("paint_b", name="file type", values=file_colors)
    + scale_manual(
        "paint_c", name="release kind", values=release_colors,
        guide=guide_legend(override_aes={"shape": 21}),
    )
    + scale_size(range=[3, 9], guide='none')
    + scale_shape_manual(name="repository", values=release_shapes)
    + ggsize(1000, 800)
    + ggtitle("Commits in 2024")
    + theme(
        axis_line_x='blank', axis_ticks_x='blank', axis_title_x='blank',
        axis_text_y='blank', axis_title_y='blank',
        panel_grid_major_x=element_line(color='gray'),
        panel_grid_major_y='blank',
    )
)

In 2024, the largest surge of Kotlin code changes occurred in late September, while the biggest Python updates came in the middle of summer.

def get_waterfall_data():
    """Return monthly totals of additions and deletions in long format.

    Commits of the "demo" file kind are excluded. The month is taken from
    the commit date converted to UTC. The result has two rows per month
    ("additions", then "deletions") with the columns ``month_number``,
    ``month``, ``changes_type`` and ``changes_size``; deletion sizes are the
    sums of ``deletions_neg``.
    """
    commits = (
        commits_df[commits_df["file_kind"] != "demo"]
        .sort_values(by="date")
        .reset_index(drop=True)
    )
    utc_dates = pd.to_datetime(commits["date"], utc=True)
    commits["month_number"] = utc_dates.dt.month
    commits["month"] = utc_dates.dt.month_name()

    monthly = commits.groupby(["month_number", "month"]).agg(
        additions=("additions", "sum"),
        deletions=("deletions_neg", "sum"),
    )
    long_df = monthly.reset_index().melt(
        id_vars=["month_number", "month"],
        value_vars=["additions", "deletions"],
        var_name="changes_type",
        value_name="changes_size",
    )
    ordered = long_df.sort_values(by=["month_number", "changes_type"])
    return ordered.reset_index(drop=True)

waterfall_df = get_waterfall_data()

# waterfall_df holds two consecutive rows per month (additions, deletions),
# so each month label is placed midway between its two bars, and the
# "Total" label after the last pair. Deriving the breaks from the months
# actually present keeps `breaks` and `labels` the same length even when
# the data does not cover all twelve months.
month_names = list(waterfall_df["month"].unique())
breaks = [2 * i + .5 for i in range(len(month_names))] + [2 * len(month_names)]
labels = month_names + ["Total"]

# Both tooltip kinds show the x label as the title and the bar's change.
(
    # Reuse waterfall_df instead of recomputing the same data.
    waterfall_plot(
        waterfall_df, "month", "changes_size",
        label='blank', show_legend=True, size=.75, width=.75,
        absolute_tooltips=layer_tooltips()
                         .title("@..xlabel..")
                         .line("@..dy..")
                         .disable_splitting(),
        relative_tooltips=layer_tooltips()
                         .title("@..xlabel..")
                         .line("@..dy..")
                         .disable_splitting(),
    )
    + scale_x_continuous(name="Month", breaks=breaks, labels=labels)
    + scale_fill_discrete(
        name="", labels={"Increase": "additions", "Decrease": "deletions"},
        guide=guide_legend(override_aes={'size': 0}),
    )
    + ylab("Changes")
    + ggsize(1000, 800)
    + ggtitle("Total additions and deletions by month in all repositories")
)

Throughout the year, despite some deletions, the amount of code has steadily increased, especially in the fall.

def get_issues_data():
    """Return the issues relevant to 2024 and the closed subset of them.

    An issue is kept when it was created or closed in 2024. A ``duration``
    column (whole days between creation and closing) is added.

    Returns:
        A tuple ``(actual_issues_df, closed_actual_issues_df)``; the second
        frame contains only the rows whose ``closed`` flag is true.
    """
    created_in_2024 = issues_df["created_at"].dt.year == 2024
    closed_in_2024 = issues_df["closed_at"].dt.year == 2024

    recent = issues_df[created_in_2024 | closed_in_2024].reset_index(drop=True)
    recent = recent.assign(
        duration=(recent["closed_at"] - recent["created_at"]).dt.days,
    )
    closed_recent = recent[recent["closed"]].reset_index(drop=True)
    return recent, closed_recent

def get_issue_tooltip():
    """Build the tooltip shown for an issue point or segment.

    The title shows the repository name and a link to the issue; the lines
    show the issue title, author, creation and closing timestamps and the
    duration in days.
    """
    # The link is built from the issue's own repository: the data contains
    # issues of several repositories, so a fixed "lets-plot" path would send
    # issues of the other repositories to the wrong page.
    return layer_tooltips().title("@repo_name\nIssue <a href=\"https://github.com/JetBrains/@{repo_name}/issues/@number\">#@number</a>").format("@number", "d")\
                           .line("@title")\
                           .line("author|@author")\
                           .line("created|@created_at").format("@created_at", "%Y-%m-%d, %l:%M %p")\
                           .line("closed|@closed_at").format("@closed_at", "%Y-%m-%d, %l:%M %p")\
                           .line("duration (days)|@duration").format("@duration", ".2~s")

actual_issues_df, closed_actual_issues_df = get_issues_data()

(
    ggplot()
    # One point per issue at its creation date; fill encodes the closed flag.
    + geom_point(
        aes("created_at", y=as_discrete("number", order_by="created_at", order=1),
            shape="repo_name", paint_a="closed"),
        data=actual_issues_df, size=2, fill_by="paint_a", tooltips=get_issue_tooltip(),
    )
    # For closed issues: an arrow from creation to closing, coloured by duration.
    + geom_segment(
        aes(x="created_at", xend="closed_at", y="number", yend="number", paint_b="duration"),
        data=closed_actual_issues_df, arrow=arrow(angle=10), color_by="paint_b",
        tooltips=get_issue_tooltip(),
    )
    + scale_x_datetime(
        name="", breaks=[datetime(2024, month, 2) for month in range(1, 13)], format="%b",
    )
    + scale_y_continuous(format="#{d}")
    + scale_manual(
        "paint_a", name="issue is closed", values={True: "#4daf4a", False: "#e41a1c"},
        guide=guide_legend(override_aes={'shape': 21}),
    )
    + scale_continuous("paint_b", name="duration (days)", low="#00441b", high="#c7e9c0")
    + scale_shape_manual(name="repository", values=release_shapes)
    + coord_cartesian(xlim=[datetime(2024, 1, 1), datetime(2024, 12, 31)])
    + theme(legend_position='bottom')
    + ggtitle("Issues of 2024", "[Interactive plot]")
    + ggsize(1000, 800)
    + theme(axis_title_y='blank', axis_text_y='blank')
    + ggtb()
)

A significant number of issues were resolved this year, including some longstanding ones.

Note that the plot above is interactive. Use the toolbox to zoom in and see more detail.

def get_files_top_data(top_size):
    """Return the ``top_size`` most-changed Kotlin files in long format.

    Files are ranked by ``additions + deletions``. Files whose path starts
    with "demo" or "plot-stem", or ends with "Test.kt", are skipped. The
    result has one row per (file, change type) with the change size in
    ``change_size``; deletions are negated.
    """
    kt_files = files_df[files_df["extension"] == "kt"].copy()
    kt_files["changes"] = kt_files["additions"] + kt_files["deletions"]
    kt_files["deletions"] = -kt_files["deletions"]
    kt_files = kt_files.sort_values(by="changes", ascending=False)

    not_data_file = ~kt_files["extension"].isin(["csv", "ipynb"])
    not_demo = ~kt_files["file_path"].str.startswith("demo")
    not_plot_stem = ~kt_files["file_path"].str.startswith("plot-stem")
    not_test = ~kt_files["file_path"].str.endswith("Test.kt")
    candidates = kt_files[not_data_file & not_demo & not_plot_stem & not_test]

    top_files = candidates.head(top_size).reset_index(drop=True)
    # Key combining repository and path, used to tell files apart.
    top_files["full_path"] = top_files["repo_name"] + top_files["file_path"]
    return top_files.melt(
        id_vars=["repo_name", "full_path", "file_path", "file_name"],
        value_vars=["additions", "deletions"],
        var_name="change_type",
        value_name="change_size",
    )

top_size = 20
files_top_df = get_files_top_data(top_size)
# Axis labels: full_path key -> short file name.
files_top_labels = dict(zip(files_top_df["full_path"], files_top_df["file_name"]))

(
    ggplot(files_top_df, aes("full_path", "change_size", fill="change_type"))
    + geom_bar(
        stat='identity',
        tooltips=(
            layer_tooltips()
            .title("@file_name")
            .line("changes|@change_size @change_type")
            .line("repository|@repo_name")
            .line("file path|@file_path")
        ),
    )
    # Repository marker on the zero line of every bar.
    + geom_point(aes(shape="repo_name"), y=0, fill="white")
    + scale_x_discrete(name="", labels=files_top_labels)
    + scale_fill_manual(name="", values={"additions": "#4daf4a", "deletions": "#e41a1c"})
    + scale_shape_manual(name="repository", values=release_shapes)
    + ylab("changes")
    + ggsize(1000, 700)
    + ggtitle(f"Top {top_size} files by number of changes per year")
)

Judging by the most heavily edited Kotlin files, 2024’s features such as the polar coordinate system, interactivity, and number formatting caused the most changes in the code.

Plots: documentation site#

All the plots in this section come from Google Search Console statistics. These statistics show how well the documentation site is doing.

Here’s a description of the data columns used:

  • Impressions are the total number of times your site’s link is shown in Google Search results. They reflect how visible your pages are to potential visitors: more impressions usually mean higher potential reach. Having a consistently growing number of impressions suggests increasing brand or content visibility.

  • Clicks indicate how many times users actually clicked through to your site from Google Search results. A higher click count generally means your site titles and snippets resonate with user intent. Working on relevant keywords and enticing descriptions can help you increase clicks.

  • (Average) Position shows your page’s typical ranking in search results, with 1 being the very top. The closer your average position is to 1, the more likely you are to attract clicks and traffic. Maintaining a low average position (near #1) usually correlates with higher visibility and engagement.

  • CTR (Click-Through Rate) is the percentage of impressions that result in a click, calculated as clicks ÷ impressions × 100. A higher CTR means users find your listing appealing or relevant to their search query. Aim for a steadily improving CTR—benchmarks vary by niche, but for a documentation site, a value above 5% is considered a good CTR for organic search.

def get_countries_plot(col, palette, *, trans=None):
    """Build a map of countries filled by one search-performance column.

    Args:
        col: Column of ``lpd_performance_countries_df`` mapped to the fill;
            also used as the plot title.
        palette: Name of the sequential brewer palette for the fill scale.
        trans: Optional transformation passed to the fill scale.
    """
    # CTR values are percentages, so the tooltip value gets a "%" suffix.
    value_suffix = "%" if col == "CTR" else ""
    tooltips = (
        layer_tooltips()
        .title("@{found name}")
        .line(f"@|@{col}{value_suffix}")
    )
    countries_layer = geom_map(
        aes(fill=col),
        data=lpd_performance_countries_df, map=countries_gdf,
        map_join=["Country", "country"],
        size=.25,
        tooltips=tooltips,
    )
    fill_scale = scale_fill_brewer(type='seq', palette=palette, trans=trans, guide='none')
    return ggplot() + countries_layer + fill_scale + ggtitle(col) + theme_void()

# 2x2 grid: one map per search-performance metric.
(
    gggrid(
        [
            get_countries_plot("Clicks", "Blues", trans='log10'),
            get_countries_plot("Impressions", "Reds", trans='log10'),
            get_countries_plot("CTR", "Purples", trans='log10'),
            get_countries_plot("Position", "Greens", trans='reverse'),
        ],
        ncol=2,
    )
    + ggsize(1000, 800)
)

In the plots above, a darker color indicates a better value.

def get_releases_range_data():
    """Return the "lets-plot" releases with a ``date_end`` column.

    ``date_end`` is the date of the following release; the last release
    ends at 2025-01-01.
    """
    lets_plot_releases = releases_df[releases_df["repo_name"] == "lets-plot"].copy()
    following_dates = [
        *lets_plot_releases["date"].iloc[1:],
        *pd.to_datetime(["2025-01-01"]),
    ]
    lets_plot_releases["date_end"] = following_dates
    return lets_plot_releases

def get_releases_labels_data(y, dy):
    """Return label positions for the "lets-plot" releases after the first.

    Each label is placed 4 days after the release date (column ``x``) at
    height ``y``; the label of version "4.5.1" is placed at ``y + dy``.
    """
    is_lets_plot = releases_df["repo_name"] == "lets-plot"
    labels_df = releases_df[is_lets_plot].iloc[1:].copy()
    labels_df["x"] = labels_df["date"] + pd.Timedelta(days=4)
    labels_df["y"] = np.where(labels_df["version"] == "4.5.1", y + dy, y)
    return labels_df

def get_dates_tooltips():
    """Return tooltips that format the ``Date`` value as day and short month."""
    tooltips = layer_tooltips()
    return tooltips.format("Date", "%d %b")

def get_dates_plot(col, color, *, y, dy, reverse_y_axis=False):
    """Build a daily time-series plot of one search-performance column.

    The plot combines release bands, release version labels, the daily line
    and a LOESS smoothing curve.

    Args:
        col: Column of ``lpd_performance_dates_df`` to plot; also the title.
        color: Colour of the daily line.
        y: Label height, forwarded to ``get_releases_labels_data``.
        dy: Label offset, forwarded to ``get_releases_labels_data``.
        reverse_y_axis: When true, the y scale uses the 'reverse' transform.
    """
    y_trans = 'reverse' if reverse_y_axis else None
    month_breaks = [datetime(2024, month, 2) for month in range(1, 13)]
    return (
        ggplot()
        # One band per release, from its date to the next release.
        + geom_band(
            aes(xmin="date", xmax="date_end", paint_a="release_kind"),
            data=get_releases_range_data(), color_by='paint_a', fill_by='paint_a', alpha=.25,
            tooltips=get_release_tooltip().disable_splitting(),
        )
        + geom_text(aes("x", "y", label="version"), data=get_releases_labels_data(y, dy), angle=90)
        + geom_line(
            aes("Date", col), data=lpd_performance_dates_df, color=color,
            tooltips=get_dates_tooltips(),
        )
        + geom_smooth(
            aes("Date", col), data=lpd_performance_dates_df,
            method='loess', level=.99, color="black", fill="cyan", alpha=.25,
            tooltips=get_dates_tooltips(),
        )
        + scale_x_datetime(name="", breaks=month_breaks, expand=[0, 0], format="%b")
        + scale_y_continuous(trans=y_trans)
        + scale_manual("paint_a", values=release_colors, guide='none')
        + coord_cartesian(xlim=[datetime(2024, 1, 1), datetime(2024, 12, 31)])
        + ggtitle(col)
    )

# One time-series plot per metric, stacked in a single column.
(
    gggrid(
        [
            get_dates_plot("Clicks", "#377eb8", y=230, dy=-40),
            get_dates_plot("Impressions", "#e41a1c", y=3_500, dy=-3_000),
            get_dates_plot("CTR", "#984ea3", y=19, dy=-3),
            get_dates_plot("Position", "#4daf4a", y=35, dy=-4, reverse_y_axis=True),
        ],
        ncol=1,
    )
    + ggsize(1000, 1600)
    + theme(legend_position='bottom')
)

While the site’s CTR has dropped from around 5% to 2.5%, we’ve seen continued growth in both Impressions and Clicks. This likely indicates that the pages are showing up in more searches, including some less relevant ones, which boosts overall visibility and traffic but can dilute the overall click-through rate.