Clustering with the timeouts#

Previously, we performed the clustering without considering the paths left unfinished due to timeouts. We now redo the clustering while taking them into account.

import pandas as pd
import numpy as np
import re
from pathlib import Path

# dimensionality reduction
from sklearn.decomposition import PCA
from umap import umap_ as UMAP

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()

# clustering
from leiden_clustering import LeidenClustering
import utils
# Resolve the data and figure locations relative to the notebook's working directory.
DATA_PATH = Path.cwd() / "../data"
FIG_PATH = Path.cwd() / "../src"

# When True, timed-out (unfinished) paths are excluded from the clustering;
# figures go to a separate directory so both variants can coexist on disk.
FILTER_TIMEOUT = False
if FILTER_TIMEOUT:
    FIG_PATH = FIG_PATH / "figures/leiden/"
else:
    FIG_PATH = FIG_PATH / "figures/leiden_with_timeout/"
FIG_PATH = FIG_PATH.resolve()
print(f"Figure path: {FIG_PATH}")

# Load every combined metrics CSV, keyed by its file stem
# (e.g. "combined_metrics_finished_paths").
data = {
    Path(f).stem: pd.read_csv(f, index_col=0) for f in DATA_PATH.glob("combined_*.csv")
}
print(list(data.keys()))

# exist_ok avoids the race-prone exists()-then-mkdir() pair.
FIG_PATH.mkdir(parents=True, exist_ok=True)
Figure path: C:\Users\Cyril\Desktop\Code\ada-2023-project-adamants\src\figures\leiden_with_timeout
['combined_metrics_finished_edges', 'combined_metrics_finished_paths', 'combined_metrics_unfinished_edges', 'combined_metrics_unfinished_paths']
# Pull the per-path feature tables and give them clean positional indices.
features_finished_paths = data["combined_metrics_finished_paths"].reset_index(drop=True)
features_unfinished_paths = data["combined_metrics_unfinished_paths"].reset_index(drop=True)

# Stack finished on top of unfinished paths, tagging each row with a binary
# "finished" indicator (1 = finished, 0 = timed out).
n_finished = len(features_finished_paths)
n_unfinished = len(features_unfinished_paths)
combined_df = pd.concat([features_finished_paths, features_unfinished_paths], axis=0)
combined_df["finished"] = [1] * n_finished + [0] * n_unfinished

# Normalize the clustering features in place, then drop any row where one of
# them is missing so the clustering input is complete.
cluster_cols = utils.FEATURES_COLS_USED_FOR_CLUSTERING
combined_df[cluster_cols] = utils.normalize_features(combined_df[cluster_cols])
combined_df.dropna(subset=cluster_cols, inplace=True)

# Feature matrix handed to the Leiden clustering and UMAP below.
X = combined_df[cluster_cols].copy().values
# Fix the NumPy seed so the pipeline is reproducible across reruns.
np.random.seed(2)

# Leiden community detection on a 7-component PCA of the features.
# n_iterations=-1 iterates until convergence; resolution 0.2 keeps the
# partition coarse.
leiden_params = {"n_iterations": -1, "seed": 0, "resolution_parameter": 0.2}
clustering = LeidenClustering(
    leiden_kws=leiden_params,
    pca_kws={"n_components": 7},
)
clustering.fit(X)
clustering.labels_
c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:346: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")


c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:348: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")


c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:358: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")

array([0, 5, 0, ..., 3, 3, 4])
# UMAP: project the feature matrix to 3-D with Euclidean distance for plotting.
# random_state pins the embedding so reruns produce the same figure —
# np.random.seed alone does not make UMAP deterministic (its parallel
# nearest-neighbor search uses its own RNG unless random_state is set).
umap = UMAP.UMAP(n_components=3, metric="euclidean", random_state=2)
result_umap_euc = umap.fit_transform(X)
c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:346: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")


c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:348: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")


c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:358: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")

UMAP plot of the Leiden clustering#

# 3-D scatter of the UMAP embedding, colored by Leiden cluster label.
n_clusters = len(np.unique(clustering.labels_))
fig = px.scatter_3d(
    result_umap_euc,
    x=0,
    y=1,
    z=2,
    color=clustering.labels_.astype(str),
    category_orders={"color": [str(i) for i in range(n_clusters)]},
    title="UMAP, clustering by leiden algorithm",
    # reduce size points
    size_max=0.1,
)

# Shared style for the three scene axes: hidden tick labels, transparent
# panes, white zero lines. update_layout merges layout dicts recursively,
# so one consolidated call is equivalent to a chain of separate calls.
axis_style = dict(
    showticklabels=False,
    backgroundcolor="rgba(0, 0, 0,0)",
    # gridcolor="rgba(0, 0, 0,0)", # gridcolor is for logo
    showbackground=True,
    zerolinecolor="white",
)
fig.update_layout(
    plot_bgcolor="#14181e",
    paper_bgcolor="#14181e",
    font_color="white",
    legend_title_text="Cluster",
    legend=dict(bgcolor="rgba(0,0,0,0)"),
    scene=dict(
        xaxis=dict(title="UMAP 1", **axis_style),
        yaxis=dict(title="UMAP 2", **axis_style),
        zaxis=dict(title="UMAP 3", **axis_style),
    ),
)

display(fig)

We see that when timeouts are taken into account, a new cluster appears! We now need to check the distribution of the features to see whether this new cluster is related to the timeouts.

Features distributions across clusters#

feat_labels = utils.get_feature_names_labels()

# Histograms of each normalized clustering feature, one subplot per feature,
# with per-cluster densities overlaid (common_norm=False normalizes each
# cluster's histogram independently so cluster sizes don't dominate).
with plt.style.context("dark_background"):
    plot_data = combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING].copy().dropna()
    plot_data["cluster"] = clustering.labels_
    plot_data["cluster"] = plot_data["cluster"].astype("category")

    n_features = len(plot_data.columns) - 1  # every column except "cluster"
    n_cols = 3
    n_rows = int(np.ceil(n_features / n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20))
    axs = axs.flatten()

    for idx, feature in enumerate(plot_data.columns):
        if feature == "cluster":
            continue
        sns.histplot(
            data=plot_data,
            x=feature,
            hue="cluster",
            ax=axs[idx],
            stat="density",
            common_norm=False,
            palette="tab10",
        )
        utils.set_axis_style(axs, idx)

    plt.suptitle("Histograms of normalized features colored by cluster", fontsize=20)
    plt.subplots_adjust(top=0.95)
    plt.show()
_images/c61bc6f10c761bc4dd50fd51d82f1187187354736eab2ac856367d11bea53782.png

As is visible in the durationinSec plot, the third cluster (in red) concentrates the longest times taken; the same pattern appears in coarse_mean_time.

# Boxplots of each normalized feature per cluster; a z-scored "finished"
# flag is added so cluster composition by completion status is visible too.
with plt.style.context("dark_background"):
    finished = combined_df["finished"]
    combined_df["finished_normalized"] = (finished - finished.mean()) / finished.std()

    cols = utils.FEATURES_COLS_USED_FOR_CLUSTERING + ["finished_normalized"]
    plot_data = combined_df[cols].copy()
    plot_data["cluster"] = clustering.labels_
    plot_data["cluster"] = plot_data["cluster"].astype("category")

    n_features = len(plot_data.columns) - 1  # every column except "cluster"
    n_cols = 4
    n_rows = int(np.ceil(n_features / n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20))
    fig.suptitle("Boxplots of normalized features colored by cluster")
    plt.subplots_adjust(top=0.95)
    axs = axs.flatten()

    for idx, feature in enumerate(plot_data.columns):
        if feature == "cluster":
            continue
        sns.boxplot(
            data=plot_data,
            x="cluster",
            y=feature,
            ax=axs[idx],
            hue="cluster",
            palette="tab10",
        )
        utils.set_axis_style(axs, idx)
    plt.show()
_images/0c79c8a259ae638bfbdb3dc94eea353c10201a6b68f699cc3686a50d83502577.png

Heatmap of feature means for each cluster#

# Heatmap of per-cluster feature means; values are z-scores, so the
# coolwarm scale is centered on 0 and clipped to [-1, 1].
with plt.style.context("dark_background"):
    means = plot_data.groupby(plot_data["cluster"]).mean()
    sns.heatmap(means, cmap="coolwarm", annot=True, fmt=".1f", vmin=-1, vmax=1)
    plt.title("Heatmap of normalized features means per cluster")
    plt.ylabel("Cluster")
    plt.xlabel("Feature")
    # One tick per feature column plus the "finished" flag, centered on cells.
    tick_labels = feat_labels + ["Finished (norm)"]
    plt.xticks(
        ticks=np.arange(len(tick_labels)) + 0.5,
        labels=tick_labels,
        rotation=90,
    )
    plt.show()
_images/13f413f72f1ab4fd250fc7372b9a0a9059d7899d895f886885a143aa7a4a9cf7.png

Number of paths per cluster#

# Bar chart of cluster sizes; hue mirrors the cluster index so the colors
# match the tab10 palette used in the other figures.
clusters_names, n_points = np.unique(plot_data["cluster"], return_counts=True)
with plt.style.context("dark_background"):
    ax = sns.barplot(
        x=clusters_names, y=n_points, hue=range(len(clusters_names)), palette="tab10"
    )
    ax.set_title("Number of paths per cluster")
    # make background grey
    ax.patch.set_facecolor("#d3d3d3")
    # restyle the legend for the light background
    leg = ax.get_legend()
    leg.get_frame().set_facecolor("#d3d3d3")
    leg.set_title("Cluster")
    leg.get_title().set_color("black")
    for text in leg.get_texts():
        text.set_color("black")
    plt.xlabel("Cluster")
    plt.ylabel("Number of paths")
plt.show()
_images/509e1ab9790c49bdc75d42e08767ea063c3cf4890c380ad40995aba83ed1141c.png

Number of timeouts per cluster#

# Only meaningful when timed-out paths were kept in the data: count how
# many timeouts land in each cluster.
if not FILTER_TIMEOUT:
    with plt.style.context("dark_background"):
        plot_data["timeout"] = combined_df["type"] == "timeout"
        timeout_per_cluster = plot_data.groupby(plot_data["cluster"])["timeout"].sum()
        ax = sns.barplot(
            x=clusters_names,
            y=timeout_per_cluster,
            hue=range(len(clusters_names)),
            palette="tab10",
        )
        ax.set_title("Number of timeouts per cluster")
        ax.patch.set_facecolor("#d3d3d3")
        # restyle the legend for the light background
        leg = ax.get_legend()
        leg.get_frame().set_facecolor("#d3d3d3")
        leg.set_title("Cluster")
        leg.get_title().set_color("black")
        for text in leg.get_texts():
            text.set_color("black")
        plt.xlabel("Cluster")
        plt.ylabel("Number of timeouts")
    plt.show()
_images/97e4fc57df2d4d09ee8ba3119c0ac9896542d722d48e5413644e9f8b5c8752d7.png

Again, we can see here that most timeouts fall into the third cluster.